path: root/fs/xfs
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/xfs
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Kconfig  85
-rw-r--r--  fs/xfs/Makefile  150
-rw-r--r--  fs/xfs/linux-2.6/kmem.c  134
-rw-r--r--  fs/xfs/linux-2.6/kmem.h  157
-rw-r--r--  fs/xfs/linux-2.6/mrlock.h  106
-rw-r--r--  fs/xfs/linux-2.6/mutex.h  53
-rw-r--r--  fs/xfs/linux-2.6/sema.h  67
-rw-r--r--  fs/xfs/linux-2.6/spin.h  56
-rw-r--r--  fs/xfs/linux-2.6/sv.h  89
-rw-r--r--  fs/xfs/linux-2.6/time.h  51
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  1275
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  1980
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h  591
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h  50
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c  205
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.h  122
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c  573
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c  124
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.h  48
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.c  74
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.h  44
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  1336
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c  163
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h  34
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c  680
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.h  51
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h  374
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c  1082
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.h  116
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.c  132
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.h  166
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c  912
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h  138
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c  174
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.h  114
-rw-r--r--  fs/xfs/linux-2.6/xfs_version.h  44
-rw-r--r--  fs/xfs/linux-2.6/xfs_vfs.c  330
-rw-r--r--  fs/xfs/linux-2.6/xfs_vfs.h  223
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.c  455
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h  666
-rw-r--r--  fs/xfs/quota/xfs_dquot.c  1648
-rw-r--r--  fs/xfs/quota/xfs_dquot.h  224
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c  715
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.h  66
-rw-r--r--  fs/xfs/quota/xfs_qm.c  2848
-rw-r--r--  fs/xfs/quota/xfs_qm.h  236
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c  410
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c  149
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.h  68
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c  1458
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h  192
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c  941
-rw-r--r--  fs/xfs/support/debug.c  127
-rw-r--r--  fs/xfs/support/debug.h  72
-rw-r--r--  fs/xfs/support/ktrace.c  346
-rw-r--r--  fs/xfs/support/ktrace.h  101
-rw-r--r--  fs/xfs/support/move.c  66
-rw-r--r--  fs/xfs/support/move.h  84
-rw-r--r--  fs/xfs/support/qsort.c  155
-rw-r--r--  fs/xfs/support/qsort.h  41
-rw-r--r--  fs/xfs/support/uuid.c  151
-rw-r--r--  fs/xfs/support/uuid.h  48
-rw-r--r--  fs/xfs/xfs.h  40
-rw-r--r--  fs/xfs/xfs_acl.c  937
-rw-r--r--  fs/xfs/xfs_acl.h  116
-rw-r--r--  fs/xfs/xfs_ag.h  345
-rw-r--r--  fs/xfs/xfs_alloc.c  2623
-rw-r--r--  fs/xfs/xfs_alloc.h  203
-rw-r--r--  fs/xfs/xfs_alloc_btree.c  2204
-rw-r--r--  fs/xfs/xfs_alloc_btree.h  257
-rw-r--r--  fs/xfs/xfs_arch.h  213
-rw-r--r--  fs/xfs/xfs_attr.c  2660
-rw-r--r--  fs/xfs/xfs_attr.h  193
-rw-r--r--  fs/xfs/xfs_attr_leaf.c  3050
-rw-r--r--  fs/xfs/xfs_attr_leaf.h  308
-rw-r--r--  fs/xfs/xfs_attr_sf.h  149
-rw-r--r--  fs/xfs/xfs_behavior.c  218
-rw-r--r--  fs/xfs/xfs_behavior.h  204
-rw-r--r--  fs/xfs/xfs_bit.c  312
-rw-r--r--  fs/xfs/xfs_bit.h  85
-rw-r--r--  fs/xfs/xfs_bmap.c  6246
-rw-r--r--  fs/xfs/xfs_bmap.h  379
-rw-r--r--  fs/xfs/xfs_bmap_btree.c  2807
-rw-r--r--  fs/xfs/xfs_bmap_btree.h  701
-rw-r--r--  fs/xfs/xfs_btree.c  949
-rw-r--r--  fs/xfs/xfs_btree.h  592
-rw-r--r--  fs/xfs/xfs_buf_item.c  1221
-rw-r--r--  fs/xfs/xfs_buf_item.h  171
-rw-r--r--  fs/xfs/xfs_cap.h  84
-rw-r--r--  fs/xfs/xfs_clnt.h  110
-rw-r--r--  fs/xfs/xfs_da_btree.c  2648
-rw-r--r--  fs/xfs/xfs_da_btree.h  335
-rw-r--r--  fs/xfs/xfs_dfrag.c  387
-rw-r--r--  fs/xfs/xfs_dfrag.h  67
-rw-r--r--  fs/xfs/xfs_dinode.h  418
-rw-r--r--  fs/xfs/xfs_dir.c  1223
-rw-r--r--  fs/xfs/xfs_dir.h  154
-rw-r--r--  fs/xfs/xfs_dir2.c  859
-rw-r--r--  fs/xfs/xfs_dir2.h  109
-rw-r--r--  fs/xfs/xfs_dir2_block.c  1248
-rw-r--r--  fs/xfs/xfs_dir2_block.h  126
-rw-r--r--  fs/xfs/xfs_dir2_data.c  855
-rw-r--r--  fs/xfs/xfs_dir2_data.h  231
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c  1896
-rw-r--r--  fs/xfs/xfs_dir2_leaf.h  360
-rw-r--r--  fs/xfs/xfs_dir2_node.c  2020
-rw-r--r--  fs/xfs/xfs_dir2_node.h  159
-rw-r--r--  fs/xfs/xfs_dir2_sf.c  1317
-rw-r--r--  fs/xfs/xfs_dir2_sf.h  243
-rw-r--r--  fs/xfs/xfs_dir2_trace.c  235
-rw-r--r--  fs/xfs/xfs_dir2_trace.h  86
-rw-r--r--  fs/xfs/xfs_dir_leaf.c  2231
-rw-r--r--  fs/xfs/xfs_dir_leaf.h  248
-rw-r--r--  fs/xfs/xfs_dir_sf.h  172
-rw-r--r--  fs/xfs/xfs_dmapi.h  212
-rw-r--r--  fs/xfs/xfs_dmops.c  52
-rw-r--r--  fs/xfs/xfs_error.c  327
-rw-r--r--  fs/xfs/xfs_error.h  196
-rw-r--r--  fs/xfs/xfs_extfree_item.c  668
-rw-r--r--  fs/xfs/xfs_extfree_item.h  123
-rw-r--r--  fs/xfs/xfs_fs.h  527
-rw-r--r--  fs/xfs/xfs_fsops.c  616
-rw-r--r--  fs/xfs/xfs_fsops.h  67
-rw-r--r--  fs/xfs/xfs_ialloc.c  1401
-rw-r--r--  fs/xfs/xfs_ialloc.h  184
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c  2094
-rw-r--r--  fs/xfs/xfs_ialloc_btree.h  314
-rw-r--r--  fs/xfs/xfs_iget.c  1022
-rw-r--r--  fs/xfs/xfs_imap.h  54
-rw-r--r--  fs/xfs/xfs_inode.c  3876
-rw-r--r--  fs/xfs/xfs_inode.h  554
-rw-r--r--  fs/xfs/xfs_inode_item.c  1092
-rw-r--r--  fs/xfs/xfs_inode_item.h  197
-rw-r--r--  fs/xfs/xfs_inum.h  173
-rw-r--r--  fs/xfs/xfs_iocore.c  133
-rw-r--r--  fs/xfs/xfs_iomap.c  1000
-rw-r--r--  fs/xfs/xfs_iomap.h  107
-rw-r--r--  fs/xfs/xfs_itable.c  858
-rw-r--r--  fs/xfs/xfs_itable.h  106
-rw-r--r--  fs/xfs/xfs_log.c  3560
-rw-r--r--  fs/xfs/xfs_log.h  182
-rw-r--r--  fs/xfs/xfs_log_priv.h  561
-rw-r--r--  fs/xfs/xfs_log_recover.c  4098
-rw-r--r--  fs/xfs/xfs_log_recover.h  81
-rw-r--r--  fs/xfs/xfs_mac.h  120
-rw-r--r--  fs/xfs/xfs_macros.c  2136
-rw-r--r--  fs/xfs/xfs_macros.h  104
-rw-r--r--  fs/xfs/xfs_mount.c  1586
-rw-r--r--  fs/xfs/xfs_mount.h  573
-rw-r--r--  fs/xfs/xfs_qmops.c  71
-rw-r--r--  fs/xfs/xfs_quota.h  356
-rw-r--r--  fs/xfs/xfs_refcache.h  66
-rw-r--r--  fs/xfs/xfs_rename.c  673
-rw-r--r--  fs/xfs/xfs_rtalloc.c  2469
-rw-r--r--  fs/xfs/xfs_rtalloc.h  187
-rw-r--r--  fs/xfs/xfs_rw.c  356
-rw-r--r--  fs/xfs/xfs_rw.h  154
-rw-r--r--  fs/xfs/xfs_sb.h  583
-rw-r--r--  fs/xfs/xfs_trans.c  1315
-rw-r--r--  fs/xfs/xfs_trans.h  1042
-rw-r--r--  fs/xfs/xfs_trans_ail.c  596
-rw-r--r--  fs/xfs/xfs_trans_buf.c  1093
-rw-r--r--  fs/xfs/xfs_trans_extfree.c  156
-rw-r--r--  fs/xfs/xfs_trans_inode.c  342
-rw-r--r--  fs/xfs/xfs_trans_item.c  553
-rw-r--r--  fs/xfs/xfs_trans_priv.h  73
-rw-r--r--  fs/xfs/xfs_trans_space.h  105
-rw-r--r--  fs/xfs/xfs_types.h  182
-rw-r--r--  fs/xfs/xfs_utils.c  488
-rw-r--r--  fs/xfs/xfs_utils.h  52
-rw-r--r--  fs/xfs/xfs_vfsops.c  1941
-rw-r--r--  fs/xfs/xfs_vnodeops.c  4712
172 files changed, 114893 insertions, 0 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
new file mode 100644
index 000000000000..c92306f0fdc5
--- /dev/null
+++ b/fs/xfs/Kconfig
@@ -0,0 +1,85 @@
menu "XFS support"

config XFS_FS
	tristate "XFS filesystem support"
	select EXPORTFS if NFSD!=n
	help
	  XFS is a high performance journaling filesystem which originated
	  on the SGI IRIX platform. It is completely multi-threaded, can
	  support large files and large filesystems, extended attributes,
	  variable block sizes, is extent based, and makes extensive use of
	  Btrees (directories, extents, free space) to aid both performance
	  and scalability.

	  Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
	  for complete details. This implementation is on-disk compatible
	  with the IRIX version of XFS.

	  To compile this file system support as a module, choose M here: the
	  module will be called xfs. Be aware, however, that if the file
	  system of your root partition is compiled as a module, you'll need
	  to use an initial ramdisk (initrd) to boot.

config XFS_EXPORT
	bool
	default y if XFS_FS && EXPORTFS

config XFS_RT
	bool "Realtime support (EXPERIMENTAL)"
	depends on XFS_FS && EXPERIMENTAL
	help
	  If you say Y here you will be able to mount and use XFS filesystems
	  which contain a realtime subvolume. The realtime subvolume is a
	  separate area of disk space where only file data is stored. The
	  realtime subvolume is designed to provide very deterministic
	  data rates suitable for media streaming applications.

	  See the xfs man page in section 5 for a bit more information.

	  This feature is unsupported at this time, is not yet fully
	  functional, and may cause serious problems.

	  If unsure, say N.

config XFS_QUOTA
	bool "Quota support"
	depends on XFS_FS
	help
	  If you say Y here, you will be able to set limits for disk usage on
	  a per user and/or a per group basis under XFS. XFS considers quota
	  information as filesystem metadata and uses journaling to provide a
	  higher level guarantee of consistency. The on-disk data format for
	  quota is also compatible with the IRIX version of XFS, allowing a
	  filesystem to be migrated between Linux and IRIX without any need
	  for conversion.

	  If unsure, say N. More comprehensive documentation can be found in
	  README.quota in the xfsprogs package. XFS quota can be used either
	  with or without the generic quota support enabled (CONFIG_QUOTA) -
	  they are completely independent subsystems.

config XFS_SECURITY
	bool "Security Label support"
	depends on XFS_FS
	help
	  Security labels support alternative access control models
	  implemented by security modules like SELinux. This option
	  enables an extended attribute namespace for inode security
	  labels in the XFS filesystem.

	  If you are not using a security module that requires using
	  extended attributes for inode security labels, say N.

config XFS_POSIX_ACL
	bool "POSIX ACL support"
	depends on XFS_FS
	help
	  POSIX Access Control Lists (ACLs) support permissions for users and
	  groups beyond the owner/group/world scheme.

	  To learn more about Access Control Lists, visit the POSIX ACLs for
	  Linux website <http://acl.bestbits.at/>.

	  If you don't know what Access Control Lists are, say N.

endmenu
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
new file mode 100644
index 000000000000..554e4a18c152
--- /dev/null
+++ b/fs/xfs/Makefile
@@ -0,0 +1,150 @@
#
# Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write the Free Software Foundation, Inc., 59
# Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
# Mountain View, CA 94043, or:
#
# http://www.sgi.com
#
# For further information regarding this notice, see:
#
# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
#

EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char

ifeq ($(CONFIG_XFS_DEBUG),y)
	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
	EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
endif
ifeq ($(CONFIG_XFS_TRACE),y)
	EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
	EXTRA_CFLAGS += -DXFS_ATTR_TRACE
	EXTRA_CFLAGS += -DXFS_BLI_TRACE
	EXTRA_CFLAGS += -DXFS_BMAP_TRACE
	EXTRA_CFLAGS += -DXFS_BMBT_TRACE
	EXTRA_CFLAGS += -DXFS_DIR_TRACE
	EXTRA_CFLAGS += -DXFS_DIR2_TRACE
	EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
	EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
	EXTRA_CFLAGS += -DXFS_LOG_TRACE
	EXTRA_CFLAGS += -DXFS_RW_TRACE
	EXTRA_CFLAGS += -DPAGEBUF_TRACE
	# EXTRA_CFLAGS += -DXFS_VNODE_TRACE
endif

obj-$(CONFIG_XFS_FS) += xfs.o

xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
	xfs_dquot.o \
	xfs_dquot_item.o \
	xfs_trans_dquot.o \
	xfs_qm_syscalls.o \
	xfs_qm_bhv.o \
	xfs_qm.o)
ifeq ($(CONFIG_XFS_QUOTA),y)
xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
endif

xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += linux-2.6/xfs_stats.o
xfs-$(CONFIG_SYSCTL) += linux-2.6/xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += linux-2.6/xfs_ioctl32.o
xfs-$(CONFIG_XFS_EXPORT) += linux-2.6/xfs_export.o


xfs-y += xfs_alloc.o \
	xfs_alloc_btree.o \
	xfs_attr.o \
	xfs_attr_leaf.o \
	xfs_behavior.o \
	xfs_bit.o \
	xfs_bmap.o \
	xfs_bmap_btree.o \
	xfs_btree.o \
	xfs_buf_item.o \
	xfs_da_btree.o \
	xfs_dir.o \
	xfs_dir2.o \
	xfs_dir2_block.o \
	xfs_dir2_data.o \
	xfs_dir2_leaf.o \
	xfs_dir2_node.o \
	xfs_dir2_sf.o \
	xfs_dir_leaf.o \
	xfs_error.o \
	xfs_extfree_item.o \
	xfs_fsops.o \
	xfs_ialloc.o \
	xfs_ialloc_btree.o \
	xfs_iget.o \
	xfs_inode.o \
	xfs_inode_item.o \
	xfs_iocore.o \
	xfs_iomap.o \
	xfs_itable.o \
	xfs_dfrag.o \
	xfs_log.o \
	xfs_log_recover.o \
	xfs_macros.o \
	xfs_mount.o \
	xfs_rename.o \
	xfs_trans.o \
	xfs_trans_ail.o \
	xfs_trans_buf.o \
	xfs_trans_extfree.o \
	xfs_trans_inode.o \
	xfs_trans_item.o \
	xfs_utils.o \
	xfs_vfsops.o \
	xfs_vnodeops.o \
	xfs_rw.o \
	xfs_dmops.o \
	xfs_qmops.o

xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o

# Objects in linux-2.6/
xfs-y += $(addprefix linux-2.6/, \
	kmem.o \
	xfs_aops.o \
	xfs_buf.o \
	xfs_file.o \
	xfs_fs_subr.o \
	xfs_globals.o \
	xfs_ioctl.o \
	xfs_iops.o \
	xfs_lrw.o \
	xfs_super.o \
	xfs_vfs.o \
	xfs_vnode.o)

# Objects in support/
xfs-y += $(addprefix support/, \
	debug.o \
	move.o \
	qsort.o \
	uuid.o)

xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o

diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
new file mode 100644
index 000000000000..364ea8c386b1
--- /dev/null
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -0,0 +1,134 @@
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/blkdev.h>

#include "time.h"
#include "kmem.h"

#define MAX_VMALLOCS	6
#define MAX_SLAB_SIZE	0x20000


void *
kmem_alloc(size_t size, int flags)
{
	int	retries = 0;
	int	lflags = kmem_flags_convert(flags);
	void	*ptr;

	do {
		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
			ptr = kmalloc(size, lflags);
		else
			ptr = __vmalloc(size, lflags, PAGE_KERNEL);
		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
			return ptr;
		if (!(++retries % 100))
			printk(KERN_ERR "XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, lflags);
		blk_congestion_wait(WRITE, HZ/50);
	} while (1);
}

void *
kmem_zalloc(size_t size, int flags)
{
	void	*ptr;

	ptr = kmem_alloc(size, flags);
	if (ptr)
		memset((char *)ptr, 0, (int)size);
	return ptr;
}

void
kmem_free(void *ptr, size_t size)
{
	if (((unsigned long)ptr < VMALLOC_START) ||
	    ((unsigned long)ptr >= VMALLOC_END)) {
		kfree(ptr);
	} else {
		vfree(ptr);
	}
}

void *
kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
{
	void	*new;

	new = kmem_alloc(newsize, flags);
	if (ptr) {
		if (new)
			memcpy(new, ptr,
				((oldsize < newsize) ? oldsize : newsize));
		kmem_free(ptr, oldsize);
	}
	return new;
}

void *
kmem_zone_alloc(kmem_zone_t *zone, int flags)
{
	int	retries = 0;
	int	lflags = kmem_flags_convert(flags);
	void	*ptr;

	do {
		ptr = kmem_cache_alloc(zone, lflags);
		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
			return ptr;
		if (!(++retries % 100))
			printk(KERN_ERR "XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, lflags);
		blk_congestion_wait(WRITE, HZ/50);
	} while (1);
}

void *
kmem_zone_zalloc(kmem_zone_t *zone, int flags)
{
	void	*ptr;

	ptr = kmem_zone_alloc(zone, flags);
	if (ptr)
		memset((char *)ptr, 0, kmem_cache_size(zone));
	return ptr;
}
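
A minimal caller sketch for the wrappers above (the function and buffer here are hypothetical, not part of this commit): kmem_alloc() with plain KM_SLEEP retries forever rather than failing, so a caller that can tolerate failure passes KM_MAYFAIL and checks for NULL.

	/* Hedged example, assuming a caller that may fail gracefully. */
	static int example_scratch(size_t len)
	{
		/* KM_SLEEP | KM_MAYFAIL: may block, but NULL is possible. */
		void	*buf = kmem_zalloc(len, KM_SLEEP | KM_MAYFAIL);

		if (!buf)
			return -ENOMEM;
		/* ... use buf ... */
		kmem_free(buf, len);	/* size kept for the IRIX-style API;
					 * this Linux shim ignores it */
		return 0;
	}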
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
new file mode 100644
index 000000000000..1397b669b059
--- /dev/null
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -0,0 +1,157 @@
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_KMEM_H__
#define __XFS_SUPPORT_KMEM_H__

#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mm.h>

/*
 * memory management routines
 */
#define KM_SLEEP	0x0001
#define KM_NOSLEEP	0x0002
#define KM_NOFS		0x0004
#define KM_MAYFAIL	0x0008

#define kmem_zone	kmem_cache_s
#define kmem_zone_t	kmem_cache_t

typedef unsigned long xfs_pflags_t;

#define PFLAGS_TEST_NOIO()	(current->flags & PF_NOIO)
#define PFLAGS_TEST_FSTRANS()	(current->flags & PF_FSTRANS)

#define PFLAGS_SET_NOIO() do {		\
	current->flags |= PF_NOIO;	\
} while (0)

#define PFLAGS_CLEAR_NOIO() do {	\
	current->flags &= ~PF_NOIO;	\
} while (0)

/* these could be nested, so we save state */
#define PFLAGS_SET_FSTRANS(STATEP) do {	\
	*(STATEP) = current->flags;	\
	current->flags |= PF_FSTRANS;	\
} while (0)

#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \
	*(STATEP) = current->flags;	\
	current->flags &= ~PF_FSTRANS;	\
} while (0)

/* Restore the PF_FSTRANS state to what was saved in STATEP */
#define PFLAGS_RESTORE_FSTRANS(STATEP) do {		\
	current->flags = ((current->flags & ~PF_FSTRANS) |	\
			  (*(STATEP) & PF_FSTRANS));		\
} while (0)

#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
	*(NSTATEP) = *(OSTATEP);	\
} while (0)

static __inline unsigned int kmem_flags_convert(int flags)
{
	int lflags = __GFP_NOWARN;	/* we'll report problems, if need be */

#ifdef DEBUG
	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
		printk(KERN_WARNING
		    "XFS: memory allocation with wrong flags (%x)\n", flags);
		BUG();
	}
#endif

	if (flags & KM_NOSLEEP) {
		lflags |= GFP_ATOMIC;
	} else {
		lflags |= GFP_KERNEL;

		/* avoid recursive callbacks to filesystem during transactions */
		if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
			lflags &= ~__GFP_FS;
	}

	return lflags;
}

static __inline kmem_zone_t *
kmem_zone_init(int size, char *zone_name)
{
	return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
}

static __inline void
kmem_zone_free(kmem_zone_t *zone, void *ptr)
{
	kmem_cache_free(zone, ptr);
}

static __inline void
kmem_zone_destroy(kmem_zone_t *zone)
{
	if (zone && kmem_cache_destroy(zone))
		BUG();
}

extern void *kmem_zone_zalloc(kmem_zone_t *, int);
extern void *kmem_zone_alloc(kmem_zone_t *, int);

extern void *kmem_alloc(size_t, int);
extern void *kmem_realloc(void *, size_t, size_t, int);
extern void *kmem_zalloc(size_t, int);
extern void  kmem_free(void *, size_t);

typedef struct shrinker *kmem_shaker_t;
typedef int (*kmem_shake_func_t)(int, unsigned int);

static __inline kmem_shaker_t
kmem_shake_register(kmem_shake_func_t sfunc)
{
	return set_shrinker(DEFAULT_SEEKS, sfunc);
}

static __inline void
kmem_shake_deregister(kmem_shaker_t shrinker)
{
	remove_shrinker(shrinker);
}

static __inline int
kmem_shake_allow(unsigned int gfp_mask)
{
	return (gfp_mask & __GFP_WAIT);
}

#endif /* __XFS_SUPPORT_KMEM_H__ */
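
The PFLAGS_*_FSTRANS macros save the whole task-flags word precisely so transaction scopes can nest. A minimal sketch of the intended pattern (function name hypothetical, not from this commit):

	static void example_fstrans_scope(void)
	{
		xfs_pflags_t	saved;

		/* Sets PF_FSTRANS and remembers the previous flags word;
		 * kmem_flags_convert() now strips __GFP_FS from allocations. */
		PFLAGS_SET_FSTRANS(&saved);
		/* ... transaction work; allocations behave as GFP_NOFS ... */
		PFLAGS_RESTORE_FSTRANS(&saved);	/* nest-safe: restores prior state */
	}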
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
new file mode 100644
index 000000000000..d2c11a098ff2
--- /dev/null
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -0,0 +1,106 @@
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_MRLOCK_H__
#define __XFS_SUPPORT_MRLOCK_H__

#include <linux/rwsem.h>

enum { MR_NONE, MR_ACCESS, MR_UPDATE };

typedef struct {
	struct rw_semaphore	mr_lock;
	int			mr_writer;
} mrlock_t;

#define mrinit(mrp, name)	\
	( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
#define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
#define mrfree(mrp)		do { } while (0)
#define mraccess(mrp)		mraccessf(mrp, 0)
#define mrupdate(mrp)		mrupdatef(mrp, 0)

static inline void mraccessf(mrlock_t *mrp, int flags)
{
	down_read(&mrp->mr_lock);
}

static inline void mrupdatef(mrlock_t *mrp, int flags)
{
	down_write(&mrp->mr_lock);
	mrp->mr_writer = 1;
}

static inline int mrtryaccess(mrlock_t *mrp)
{
	return down_read_trylock(&mrp->mr_lock);
}

static inline int mrtryupdate(mrlock_t *mrp)
{
	if (!down_write_trylock(&mrp->mr_lock))
		return 0;
	mrp->mr_writer = 1;
	return 1;
}

static inline void mrunlock(mrlock_t *mrp)
{
	if (mrp->mr_writer) {
		mrp->mr_writer = 0;
		up_write(&mrp->mr_lock);
	} else {
		up_read(&mrp->mr_lock);
	}
}

static inline void mrdemote(mrlock_t *mrp)
{
	mrp->mr_writer = 0;
	downgrade_write(&mrp->mr_lock);
}

#ifdef DEBUG
/*
 * Debug-only routine; without some platform-specific asm code we can
 * only answer requests about whether we hold the lock for write
 * (reader state is outside our visibility, we only track writer state).
 * Note: this means !ismrlocked would give false positives, so don't do that.
 */
static inline int ismrlocked(mrlock_t *mrp, int type)
{
	if (mrp && type == MR_UPDATE)
		return mrp->mr_writer;
	return 1;
}
#endif

#endif /* __XFS_SUPPORT_MRLOCK_H__ */
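
A sketch of the write-then-demote pattern these helpers support (the caller is hypothetical); mrunlock() keys off mr_writer to choose between up_write() and up_read():

	static void example_mrlock(mrlock_t *mrp)
	{
		mrupdate(mrp);	/* exclusive: down_write(), mr_writer = 1 */
		/* ... modify the protected structure ... */
		mrdemote(mrp);	/* mr_writer = 0, downgrade_write(): readers admitted */
		/* ... continue with read-only access ... */
		mrunlock(mrp);	/* mr_writer is 0 here, so this is up_read() */
	}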
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
new file mode 100644
index 000000000000..0b296bb944cb
--- /dev/null
+++ b/fs/xfs/linux-2.6/mutex.h
@@ -0,0 +1,53 @@
/*
 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_MUTEX_H__
#define __XFS_SUPPORT_MUTEX_H__

#include <linux/spinlock.h>
#include <asm/semaphore.h>

/*
 * Map the mutexes from IRIX to Linux semaphores.
 *
 * mutex_destroy simply initializes the count to -99, which should
 * block all other callers.
 */
#define MUTEX_DEFAULT		0x0
typedef struct semaphore	mutex_t;

#define mutex_init(lock, type, name)	sema_init(lock, 1)
#define mutex_destroy(lock)		sema_init(lock, -99)
#define mutex_lock(lock, num)		down(lock)
#define mutex_trylock(lock)		(down_trylock(lock) ? 0 : 1)
#define mutex_unlock(lock)		up(lock)

#endif /* __XFS_SUPPORT_MUTEX_H__ */
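
Since the IRIX-style mutex maps onto a counting semaphore initialised to 1, locking is plain down()/up(); a minimal sketch (names hypothetical):

	static mutex_t	example_lock;

	static void example_critical_section(void)
	{
		mutex_init(&example_lock, MUTEX_DEFAULT, "example");
		mutex_lock(&example_lock, 0);	/* second arg is discarded by the macro */
		/* ... critical section ... */
		mutex_unlock(&example_lock);
	}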
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
new file mode 100644
index 000000000000..30b67b4e1cbf
--- /dev/null
+++ b/fs/xfs/linux-2.6/sema.h
@@ -0,0 +1,67 @@
/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_SEMA_H__
#define __XFS_SUPPORT_SEMA_H__

#include <linux/time.h>
#include <linux/wait.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>

/*
 * sema_t structure just maps to struct semaphore in the Linux kernel.
 */

typedef struct semaphore sema_t;

#define init_sema(sp, val, c, d)	sema_init(sp, val)
#define initsema(sp, val)		sema_init(sp, val)
#define initnsema(sp, val, name)	sema_init(sp, val)
#define psema(sp, b)			down(sp)
#define vsema(sp)			up(sp)
#define valusema(sp)			(atomic_read(&(sp)->count))
#define freesema(sema)

/*
 * Map cpsema (try to get the sema) to down_trylock. We need to switch
 * the return values, since cpsema returns 1 (acquired) / 0 (failed) and
 * down_trylock returns the reverse: 0 (acquired) / 1 (failed).
 */

#define cpsema(sp)	(down_trylock(sp) ? 0 : 1)

/*
 * Didn't do cvsema(sp). Not sure how to map this to up/down/...
 * It does a vsema if the value is < 0, otherwise nothing.
 */

#endif /* __XFS_SUPPORT_SEMA_H__ */
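
The cpsema() inversion matters at call sites: 1 means acquired, 0 means busy. A minimal sketch (hypothetical caller):

	static void example_try_acquire(sema_t *sp)
	{
		if (cpsema(sp)) {	/* down_trylock() succeeded */
			/* ... work under the semaphore ... */
			vsema(sp);	/* release */
		}
		/* else: semaphore busy, caller does not block */
	}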
diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h
new file mode 100644
index 000000000000..bcf60a0b8df0
--- /dev/null
+++ b/fs/xfs/linux-2.6/spin.h
@@ -0,0 +1,56 @@
/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_SPIN_H__
#define __XFS_SUPPORT_SPIN_H__

#include <linux/sched.h>	/* preempt needs this */
#include <linux/spinlock.h>

/*
 * Map lock_t from IRIX to Linux spinlocks.
 *
 * We do not make use of lock_t from interrupt context, so we do not
 * have to worry about disabling interrupts at all (unlike IRIX).
 */

typedef spinlock_t lock_t;

#define SPLDECL(s)			unsigned long s

#define spinlock_init(lock, name)	spin_lock_init(lock)
#define spinlock_destroy(lock)
#define mutex_spinlock(lock)		({ spin_lock(lock); 0; })
#define mutex_spinunlock(lock, s)	do { spin_unlock(lock); (void)s; } while (0)
#define nested_spinlock(lock)		spin_lock(lock)
#define nested_spinunlock(lock)		spin_unlock(lock)

#endif /* __XFS_SUPPORT_SPIN_H__ */
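
mutex_spinlock() evaluates to a dummy 0 standing in for the saved interrupt state that IRIX callers expect, and SPLDECL() declares the variable receiving it; a minimal sketch (hypothetical caller):

	static void example_spin(lock_t *lock)
	{
		SPLDECL(s);			/* expands to: unsigned long s */

		s = mutex_spinlock(lock);	/* just spin_lock(); s becomes 0 */
		/* ... critical section, no interrupt masking needed ... */
		mutex_spinunlock(lock, s);	/* spin_unlock(); s is ignored */
	}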
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
new file mode 100644
index 000000000000..821d3167e05b
--- /dev/null
+++ b/fs/xfs/linux-2.6/sv.h
@@ -0,0 +1,89 @@
/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_SV_H__
#define __XFS_SUPPORT_SV_H__

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * Synchronisation variables.
 *
 * (Parameters "pri", "svf" and "rts" are not implemented)
 */

typedef struct sv_s {
	wait_queue_head_t waiters;
} sv_t;

#define SV_FIFO		0x0		/* sv_t is FIFO type */
#define SV_LIFO		0x2		/* sv_t is LIFO type */
#define SV_PRIO		0x4		/* sv_t is PRIO type */
#define SV_KEYED	0x6		/* sv_t is KEYED type */
#define SV_DEFAULT	SV_FIFO


static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
			    unsigned long timeout)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&sv->waiters, &wait);
	__set_current_state(state);
	spin_unlock(lock);

	schedule_timeout(timeout);

	remove_wait_queue(&sv->waiters, &wait);
}

#define init_sv(sv,type,name,flag) \
	init_waitqueue_head(&(sv)->waiters)
#define sv_init(sv,flag,name) \
	init_waitqueue_head(&(sv)->waiters)
#define sv_destroy(sv) \
	/*NOTHING*/
#define sv_wait(sv, pri, lock, s) \
	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
#define sv_wait_sig(sv, pri, lock, s) \
	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
#define sv_signal(sv) \
	wake_up(&(sv)->waiters)
#define sv_broadcast(sv) \
	wake_up_all(&(sv)->waiters)

#endif /* __XFS_SUPPORT_SV_H__ */
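
Because _sv_wait() drops the caller's spinlock before sleeping, the classic usage re-takes the lock and re-checks the condition in a loop; a minimal sketch of that pattern (names hypothetical):

	static void example_wait_for(sv_t *sv, spinlock_t *lock, int *cond)
	{
		spin_lock(lock);
		while (!*cond) {
			sv_wait(sv, 0, lock, 0);	/* unlocks, then sleeps */
			spin_lock(lock);		/* re-acquire after wakeup */
		}
		spin_unlock(lock);
	}

	/* Waker side: set *cond under the lock, then call sv_signal(sv). */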
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h
new file mode 100644
index 000000000000..6c6fd0faa8e1
--- /dev/null
+++ b/fs/xfs/linux-2.6/time.h
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_TIME_H__
#define __XFS_SUPPORT_TIME_H__

#include <linux/sched.h>
#include <linux/time.h>

typedef struct timespec timespec_t;

static inline void delay(long ticks)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_timeout(ticks);
}

static inline void nanotime(struct timespec *tvp)
{
	*tvp = CURRENT_TIME;
}

#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
new file mode 100644
index 000000000000..76a84758073a
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -0,0 +1,1275 @@
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

#include "xfs.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_trans.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_iomap.h"
#include <linux/mpage.h>
#include <linux/writeback.h>

STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *,
		struct writeback_control *wbc, void *, int, int);

#if defined(XFS_RW_TRACE)
void
xfs_page_trace(
	int		tag,
	struct inode	*inode,
	struct page	*page,
	int		mask)
{
	xfs_inode_t	*ip;
	bhv_desc_t	*bdp;
	vnode_t		*vp = LINVFS_GET_VP(inode);
	loff_t		isize = i_size_read(inode);
	loff_t		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int		delalloc = -1, unmapped = -1, unwritten = -1;

	if (page_has_buffers(page))
		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);

	bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
	ip = XFS_BHVTOI(bdp);
	if (!ip->i_rwtrace)
		return;

	ktrace_enter(ip->i_rwtrace,
		(void *)((unsigned long)tag),
		(void *)ip,
		(void *)inode,
		(void *)page,
		(void *)((unsigned long)mask),
		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
		(void *)((unsigned long)((isize >> 32) & 0xffffffff)),
		(void *)((unsigned long)(isize & 0xffffffff)),
		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
		(void *)((unsigned long)(offset & 0xffffffff)),
		(void *)((unsigned long)delalloc),
		(void *)((unsigned long)unmapped),
		(void *)((unsigned long)unwritten),
		(void *)NULL,
		(void *)NULL);
}
#else
#define xfs_page_trace(tag, inode, page, mask)
#endif

void
linvfs_unwritten_done(
	struct buffer_head	*bh,
	int			uptodate)
{
	xfs_buf_t		*pb = (xfs_buf_t *)bh->b_private;

	ASSERT(buffer_unwritten(bh));
	bh->b_end_io = NULL;
	clear_buffer_unwritten(bh);
	if (!uptodate)
		pagebuf_ioerror(pb, EIO);
	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
		pagebuf_iodone(pb, 1, 1);
	}
	end_buffer_async_write(bh, uptodate);
}

/*
 * Issue transactions to convert a buffer range from unwritten
 * to written extents (buffered IO).
 */
STATIC void
linvfs_unwritten_convert(
	xfs_buf_t	*bp)
{
	vnode_t		*vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
	int		error;

	BUG_ON(atomic_read(&bp->pb_hold) < 1);
	VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
			BMAPI_UNWRITTEN, NULL, NULL, error);
	XFS_BUF_SET_FSPRIVATE(bp, NULL);
	XFS_BUF_CLR_IODONE_FUNC(bp);
	XFS_BUF_UNDATAIO(bp);
	iput(LINVFS_GET_IP(vp));
	pagebuf_iodone(bp, 0, 0);
}

/*
 * Issue transactions to convert a buffer range from unwritten
 * to written extents (direct IO).
 */
STATIC void
linvfs_unwritten_convert_direct(
	struct inode	*inode,
	loff_t		offset,
	ssize_t		size,
	void		*private)
{
	ASSERT(!private || inode == (struct inode *)private);

	/* private indicates an unwritten extent lay beneath this IO */
	if (private && size > 0) {
		vnode_t	*vp = LINVFS_GET_VP(inode);
		int	error;

		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
	}
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	ssize_t			count,
	xfs_iomap_t		*mapp,
	int			flags)
{
	vnode_t			*vp = LINVFS_GET_VP(inode);
	int			error, nmaps = 1;

	VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
	if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
		VMODIFY(vp);
	return -error;
}

/*
 * Finds the mapping in the @iomapp block map that corresponds to the
 * given @offset within @page.
 */
STATIC xfs_iomap_t *
xfs_offset_to_map(
	struct page		*page,
	xfs_iomap_t		*iomapp,
	unsigned long		offset)
{
	loff_t			full_offset;	/* offset from start of file */

	ASSERT(offset < PAGE_CACHE_SIZE);

	full_offset = page->index;		/* NB: using 64bit number */
	full_offset <<= PAGE_CACHE_SHIFT;	/* offset from file start */
	full_offset += offset;			/* offset from page start */

	if (full_offset < iomapp->iomap_offset)
		return NULL;
	if (iomapp->iomap_offset + (iomapp->iomap_bsize - 1) >= full_offset)
		return iomapp;
	return NULL;
}

STATIC void
xfs_map_at_offset(
	struct page		*page,
	struct buffer_head	*bh,
	unsigned long		offset,
	int			block_bits,
	xfs_iomap_t		*iomapp)
{
	xfs_daddr_t		bn;
	loff_t			delta;
	int			sector_shift;

	ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
	ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
	ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);

	delta = page->index;
	delta <<= PAGE_CACHE_SHIFT;
	delta += offset;
	delta -= iomapp->iomap_offset;
	delta >>= block_bits;

	sector_shift = block_bits - BBSHIFT;
	bn = iomapp->iomap_bn >> sector_shift;
	bn += delta;
	BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME));
	ASSERT((bn << sector_shift) >= iomapp->iomap_bn);

	lock_buffer(bh);
	bh->b_blocknr = bn;
	bh->b_bdev = iomapp->iomap_target->pbr_bdev;
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
}
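
/*
 * A worked example of the arithmetic above, with illustrative numbers
 * (not taken from this commit): for 4K filesystem blocks, block_bits = 12
 * and sector_shift = 12 - BBSHIFT = 3, since iomap_bn is kept in 512-byte
 * basic blocks. A buffer located 4096 bytes past iomap_offset gives
 * delta = 4096 >> 12 = 1, so b_blocknr = (iomap_bn >> 3) + 1: the extent's
 * start address converted to 4K units, plus one block.
 */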
244
245/*
246 * Look for a page at index which is unlocked and contains our
247 * unwritten extent flagged buffers at its head. Returns page
248 * locked and with an extra reference count, and length of the
249 * unwritten extent component on this page that we can write,
250 * in units of filesystem blocks.
251 */
252STATIC struct page *
253xfs_probe_unwritten_page(
254 struct address_space *mapping,
255 pgoff_t index,
256 xfs_iomap_t *iomapp,
257 xfs_buf_t *pb,
258 unsigned long max_offset,
259 unsigned long *fsbs,
260 unsigned int bbits)
261{
262 struct page *page;
263
264 page = find_trylock_page(mapping, index);
265 if (!page)
266 return NULL;
267 if (PageWriteback(page))
268 goto out;
269
270 if (page->mapping && page_has_buffers(page)) {
271 struct buffer_head *bh, *head;
272 unsigned long p_offset = 0;
273
274 *fsbs = 0;
275 bh = head = page_buffers(page);
276 do {
277 if (!buffer_unwritten(bh) || !buffer_uptodate(bh))
278 break;
279 if (!xfs_offset_to_map(page, iomapp, p_offset))
280 break;
281 if (p_offset >= max_offset)
282 break;
283 xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
284 set_buffer_unwritten_io(bh);
285 bh->b_private = pb;
286 p_offset += bh->b_size;
287 (*fsbs)++;
288 } while ((bh = bh->b_this_page) != head);
289
290 if (p_offset)
291 return page;
292 }
293
294out:
295 unlock_page(page);
296 return NULL;
297}
298
299/*
300 * Look for a page at index which is unlocked and not mapped
301 * yet - clustering for mmap write case.
302 */
303STATIC unsigned int
304xfs_probe_unmapped_page(
305 struct address_space *mapping,
306 pgoff_t index,
307 unsigned int pg_offset)
308{
309 struct page *page;
310 int ret = 0;
311
312 page = find_trylock_page(mapping, index);
313 if (!page)
314 return 0;
315 if (PageWriteback(page))
316 goto out;
317
318 if (page->mapping && PageDirty(page)) {
319 if (page_has_buffers(page)) {
320 struct buffer_head *bh, *head;
321
322 bh = head = page_buffers(page);
323 do {
324 if (buffer_mapped(bh) || !buffer_uptodate(bh))
325 break;
326 ret += bh->b_size;
327 if (ret >= pg_offset)
328 break;
329 } while ((bh = bh->b_this_page) != head);
330 } else
331 ret = PAGE_CACHE_SIZE;
332 }
333
334out:
335 unlock_page(page);
336 return ret;
337}
338
339STATIC unsigned int
340xfs_probe_unmapped_cluster(
341 struct inode *inode,
342 struct page *startpage,
343 struct buffer_head *bh,
344 struct buffer_head *head)
345{
346 pgoff_t tindex, tlast, tloff;
347 unsigned int pg_offset, len, total = 0;
348 struct address_space *mapping = inode->i_mapping;
349
350 /* First sum forwards in this page */
351 do {
352 if (buffer_mapped(bh))
353 break;
354 total += bh->b_size;
355 } while ((bh = bh->b_this_page) != head);
356
357 /* If we reached the end of the page, sum forwards in
358 * following pages.
359 */
360 if (bh == head) {
361 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
362 /* Prune this back to avoid pathological behavior */
363 tloff = min(tlast, startpage->index + 64);
364 for (tindex = startpage->index + 1; tindex < tloff; tindex++) {
365 len = xfs_probe_unmapped_page(mapping, tindex,
366 PAGE_CACHE_SIZE);
367 if (!len)
368 return total;
369 total += len;
370 }
371 if (tindex == tlast &&
372 (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
373 total += xfs_probe_unmapped_page(mapping,
374 tindex, pg_offset);
375 }
376 }
377 return total;
378}
379
380/*
381 * Probe for a given page (index) in the inode and test if it is delayed
382 * and without unwritten buffers. Returns page locked and with an extra
383 * reference count.
384 */
385STATIC struct page *
386xfs_probe_delalloc_page(
387 struct inode *inode,
388 pgoff_t index)
389{
390 struct page *page;
391
392 page = find_trylock_page(inode->i_mapping, index);
393 if (!page)
394 return NULL;
395 if (PageWriteback(page))
396 goto out;
397
398 if (page->mapping && page_has_buffers(page)) {
399 struct buffer_head *bh, *head;
400 int acceptable = 0;
401
402 bh = head = page_buffers(page);
403 do {
404 if (buffer_unwritten(bh)) {
405 acceptable = 0;
406 break;
407 } else if (buffer_delay(bh)) {
408 acceptable = 1;
409 }
410 } while ((bh = bh->b_this_page) != head);
411
412 if (acceptable)
413 return page;
414 }
415
416out:
417 unlock_page(page);
418 return NULL;
419}
420
421STATIC int
422xfs_map_unwritten(
423 struct inode *inode,
424 struct page *start_page,
425 struct buffer_head *head,
426 struct buffer_head *curr,
427 unsigned long p_offset,
428 int block_bits,
429 xfs_iomap_t *iomapp,
430 struct writeback_control *wbc,
431 int startio,
432 int all_bh)
433{
434 struct buffer_head *bh = curr;
435 xfs_iomap_t *tmp;
436 xfs_buf_t *pb;
437 loff_t offset, size;
438 unsigned long nblocks = 0;
439
440 offset = start_page->index;
441 offset <<= PAGE_CACHE_SHIFT;
442 offset += p_offset;
443
444 /* get an "empty" pagebuf to manage IO completion
445 * Proper values will be set before returning */
446 pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
447 if (!pb)
448 return -EAGAIN;
449
450 /* Take a reference to the inode to prevent it from
451 * being reclaimed while we have outstanding unwritten
452 * extent IO on it.
453 */
454 if ((igrab(inode)) != inode) {
455 pagebuf_free(pb);
456 return -EAGAIN;
457 }
458
459 /* Set the count to 1 initially, this will stop an I/O
460 * completion callout which happens before we have started
461 * all the I/O from calling pagebuf_iodone too early.
462 */
463 atomic_set(&pb->pb_io_remaining, 1);
464
465 /* First map forwards in the page consecutive buffers
466 * covering this unwritten extent
467 */
468 do {
469 if (!buffer_unwritten(bh))
470 break;
471 tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
472 if (!tmp)
473 break;
474 xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
475 set_buffer_unwritten_io(bh);
476 bh->b_private = pb;
477 p_offset += bh->b_size;
478 nblocks++;
479 } while ((bh = bh->b_this_page) != head);
480
481 atomic_add(nblocks, &pb->pb_io_remaining);
482
483 /* If we reached the end of the page, map forwards in any
484 * following pages which are also covered by this extent.
485 */
486 if (bh == head) {
487 struct address_space *mapping = inode->i_mapping;
488 pgoff_t tindex, tloff, tlast;
489 unsigned long bs;
490 unsigned int pg_offset, bbits = inode->i_blkbits;
491 struct page *page;
492
493 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
494 tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
495 tloff = min(tlast, tloff);
496 for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
497 page = xfs_probe_unwritten_page(mapping,
498 tindex, iomapp, pb,
499 PAGE_CACHE_SIZE, &bs, bbits);
500 if (!page)
501 break;
502 nblocks += bs;
503 atomic_add(bs, &pb->pb_io_remaining);
504 xfs_convert_page(inode, page, iomapp, wbc, pb,
505 startio, all_bh);
506 /* stop if converting the next page might add
507 * enough blocks that the corresponding byte
508 * count won't fit in our ulong page buf length */
509 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
510 goto enough;
511 }
512
513 if (tindex == tlast &&
514 (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
515 page = xfs_probe_unwritten_page(mapping,
516 tindex, iomapp, pb,
517 pg_offset, &bs, bbits);
518 if (page) {
519 nblocks += bs;
520 atomic_add(bs, &pb->pb_io_remaining);
521 xfs_convert_page(inode, page, iomapp, wbc, pb,
522 startio, all_bh);
523 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
524 goto enough;
525 }
526 }
527 }
528
529enough:
530 size = nblocks; /* NB: using 64bit number here */
531 size <<= block_bits; /* convert fsb's to byte range */
532
533 XFS_BUF_DATAIO(pb);
534 XFS_BUF_ASYNC(pb);
535 XFS_BUF_SET_SIZE(pb, size);
536 XFS_BUF_SET_COUNT(pb, size);
537 XFS_BUF_SET_OFFSET(pb, offset);
538 XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
539 XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
540
541 if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
542 pagebuf_iodone(pb, 1, 1);
543 }
544
545 return 0;
546}
547
548STATIC void
549xfs_submit_page(
550 struct page *page,
551 struct writeback_control *wbc,
552 struct buffer_head *bh_arr[],
553 int bh_count,
554 int probed_page,
555 int clear_dirty)
556{
557 struct buffer_head *bh;
558 int i;
559
560 BUG_ON(PageWriteback(page));
561 set_page_writeback(page);
562 if (clear_dirty)
563 clear_page_dirty(page);
564 unlock_page(page);
565
566 if (bh_count) {
567 for (i = 0; i < bh_count; i++) {
568 bh = bh_arr[i];
569 mark_buffer_async_write(bh);
570 if (buffer_unwritten(bh))
571 set_buffer_unwritten_io(bh);
572 set_buffer_uptodate(bh);
573 clear_buffer_dirty(bh);
574 }
575
576 for (i = 0; i < bh_count; i++)
577 submit_bh(WRITE, bh_arr[i]);
578
579 if (probed_page && clear_dirty)
580 wbc->nr_to_write--; /* Wrote an "extra" page */
581 } else {
582 end_page_writeback(page);
583 wbc->pages_skipped++; /* We didn't write this page */
584 }
585}
586
587/*
588 * Allocate & map buffers for page given the extent map. Write it out.
589 * except for the original page of a writepage, this is called on
590 * delalloc/unwritten pages only, for the original page it is possible
591 * that the page has no mapping at all.
592 */
593STATIC void
594xfs_convert_page(
595 struct inode *inode,
596 struct page *page,
597 xfs_iomap_t *iomapp,
598 struct writeback_control *wbc,
599 void *private,
600 int startio,
601 int all_bh)
602{
603 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
604 xfs_iomap_t *mp = iomapp, *tmp;
605 unsigned long end, offset;
606 pgoff_t end_index;
607 int i = 0, index = 0;
608 int bbits = inode->i_blkbits;
609
610 end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
611 if (page->index < end_index) {
612 end = PAGE_CACHE_SIZE;
613 } else {
614 end = i_size_read(inode) & (PAGE_CACHE_SIZE-1);
615 }
616 bh = head = page_buffers(page);
617 do {
618 offset = i << bbits;
619 if (offset >= end)
620 break;
621 if (!(PageUptodate(page) || buffer_uptodate(bh)))
622 continue;
623 if (buffer_mapped(bh) && all_bh &&
624 !(buffer_unwritten(bh) || buffer_delay(bh))) {
625 if (startio) {
626 lock_buffer(bh);
627 bh_arr[index++] = bh;
628 }
629 continue;
630 }
631 tmp = xfs_offset_to_map(page, mp, offset);
632 if (!tmp)
633 continue;
634 ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
635 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY));
636
637 /* If this is a new unwritten extent buffer (i.e. one
638 * that we haven't passed in private data for, we must
639 * now map this buffer too.
640 */
641 if (buffer_unwritten(bh) && !bh->b_end_io) {
642 ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN);
643 xfs_map_unwritten(inode, page, head, bh, offset,
644 bbits, tmp, wbc, startio, all_bh);
645 } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) {
646 xfs_map_at_offset(page, bh, offset, bbits, tmp);
647 if (buffer_unwritten(bh)) {
648 set_buffer_unwritten_io(bh);
649 bh->b_private = private;
650 ASSERT(private);
651 }
652 }
653 if (startio) {
654 bh_arr[index++] = bh;
655 } else {
656 set_buffer_dirty(bh);
657 unlock_buffer(bh);
658 mark_buffer_dirty(bh);
659 }
660 } while (i++, (bh = bh->b_this_page) != head);
661
662 if (startio) {
663 xfs_submit_page(page, wbc, bh_arr, index, 1, index == i);
664 } else {
665 unlock_page(page);
666 }
667}
668
669/*
670 * Convert & write out a cluster of pages in the same extent as defined
671 * by mp and following the start page.
672 */
673STATIC void
674xfs_cluster_write(
675 struct inode *inode,
676 pgoff_t tindex,
677 xfs_iomap_t *iomapp,
678 struct writeback_control *wbc,
679 int startio,
680 int all_bh,
681 pgoff_t tlast)
682{
683 struct page *page;
684
685 for (; tindex <= tlast; tindex++) {
686 page = xfs_probe_delalloc_page(inode, tindex);
687 if (!page)
688 break;
689 xfs_convert_page(inode, page, iomapp, wbc, NULL,
690 startio, all_bh);
691 }
692}
693
694/*
695 * Calling this without startio set means we are being asked to make a dirty
696 * page ready for freeing its buffers.  When called with startio set then
697 * we are coming from writepage.
698 *
699 * When called with startio set it is important that we write the WHOLE
700 * page if possible.
701 * The bh->b_state's cannot know if any of the blocks or which block for
702 * that matter are dirty due to mmap writes, and therefore bh uptodate is
703 * only valid if the page itself isn't completely uptodate.  Some layers
704 * may clear the page dirty flag prior to calling writepage, under the
705 * assumption the entire page will be written out; by not writing out the
706 * whole page the page can be reused before all valid dirty data is
707 * written out.  Note: in the case of a page that has been dirtied by
708 * mmap write but only partially set up by block_prepare_write, the
709 * bh->b_state's will not agree and only the ones set up by BPW/BCW will
710 * have valid state; thus the whole page must be written out.
711 */
712
713STATIC int
714xfs_page_state_convert(
715 struct inode *inode,
716 struct page *page,
717 struct writeback_control *wbc,
718 int startio,
719 int unmapped) /* also implies page uptodate */
720{
721 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
722 xfs_iomap_t *iomp, iomap;
723 loff_t offset;
724 unsigned long p_offset = 0;
725 __uint64_t end_offset;
726 pgoff_t end_index, last_index, tlast;
727 int len, err, i, cnt = 0, uptodate = 1;
728 int flags = startio ? 0 : BMAPI_TRYLOCK;
729 int page_dirty, delalloc = 0;
730
731 /* Is this page beyond the end of the file? */
732 offset = i_size_read(inode);
733 end_index = offset >> PAGE_CACHE_SHIFT;
734 last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
735 if (page->index >= end_index) {
736 if ((page->index >= end_index + 1) ||
737 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
738 err = -EIO;
739 goto error;
740 }
741 }
742
743 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
744 end_offset = min_t(unsigned long long,
745 offset + PAGE_CACHE_SIZE, i_size_read(inode));
746
747 bh = head = page_buffers(page);
748 iomp = NULL;
749
750 /*
751 * page_dirty is initially a count of buffers on the page and
752 * is decremented as we move each into a cleanable state.
753 */
754 len = bh->b_size;
755 page_dirty = PAGE_CACHE_SIZE / len;
756
757 do {
758 if (offset >= end_offset)
759 break;
760 if (!buffer_uptodate(bh))
761 uptodate = 0;
762 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio)
763 continue;
764
765 if (iomp) {
766 iomp = xfs_offset_to_map(page, &iomap, p_offset);
767 }
768
769 /*
770 * First case, map an unwritten extent and prepare for
771 * extent state conversion transaction on completion.
772 */
773 if (buffer_unwritten(bh)) {
774 if (!startio)
775 continue;
776 if (!iomp) {
777 err = xfs_map_blocks(inode, offset, len, &iomap,
778 BMAPI_READ|BMAPI_IGNSTATE);
779 if (err) {
780 goto error;
781 }
782 iomp = xfs_offset_to_map(page, &iomap,
783 p_offset);
784 }
785 if (iomp) {
786 if (!bh->b_end_io) {
787 err = xfs_map_unwritten(inode, page,
788 head, bh, p_offset,
789 inode->i_blkbits, iomp,
790 wbc, startio, unmapped);
791 if (err) {
792 goto error;
793 }
794 } else {
795 set_bit(BH_Lock, &bh->b_state);
796 }
797 BUG_ON(!buffer_locked(bh));
798 bh_arr[cnt++] = bh;
799 page_dirty--;
800 }
801 /*
802 * Second case, allocate space for a delalloc buffer.
803 * We can return EAGAIN here in the release page case.
804 */
805 } else if (buffer_delay(bh)) {
806 if (!iomp) {
807 delalloc = 1;
808 err = xfs_map_blocks(inode, offset, len, &iomap,
809 BMAPI_ALLOCATE | flags);
810 if (err) {
811 goto error;
812 }
813 iomp = xfs_offset_to_map(page, &iomap,
814 p_offset);
815 }
816 if (iomp) {
817 xfs_map_at_offset(page, bh, p_offset,
818 inode->i_blkbits, iomp);
819 if (startio) {
820 bh_arr[cnt++] = bh;
821 } else {
822 set_buffer_dirty(bh);
823 unlock_buffer(bh);
824 mark_buffer_dirty(bh);
825 }
826 page_dirty--;
827 }
828 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
829 (unmapped || startio)) {
830
831 if (!buffer_mapped(bh)) {
832 int size;
833
834 /*
835 * Getting here implies an unmapped buffer
836 * was found, and we are in a path where we
837 * need to write the whole page out.
838 */
839 if (!iomp) {
840 size = xfs_probe_unmapped_cluster(
841 inode, page, bh, head);
842 err = xfs_map_blocks(inode, offset,
843 size, &iomap,
844 BMAPI_WRITE|BMAPI_MMAP);
845 if (err) {
846 goto error;
847 }
848 iomp = xfs_offset_to_map(page, &iomap,
849 p_offset);
850 }
851 if (iomp) {
852 xfs_map_at_offset(page,
853 bh, p_offset,
854 inode->i_blkbits, iomp);
855 if (startio) {
856 bh_arr[cnt++] = bh;
857 } else {
858 set_buffer_dirty(bh);
859 unlock_buffer(bh);
860 mark_buffer_dirty(bh);
861 }
862 page_dirty--;
863 }
864 } else if (startio) {
865 if (buffer_uptodate(bh) &&
866 !test_and_set_bit(BH_Lock, &bh->b_state)) {
867 bh_arr[cnt++] = bh;
868 page_dirty--;
869 }
870 }
871 }
872 } while (offset += len, p_offset += len,
873 ((bh = bh->b_this_page) != head));
874
875 if (uptodate && bh == head)
876 SetPageUptodate(page);
877
878 if (startio)
879 xfs_submit_page(page, wbc, bh_arr, cnt, 0, 1);
880
881 if (iomp) {
882 tlast = (iomp->iomap_offset + iomp->iomap_bsize - 1) >>
883 PAGE_CACHE_SHIFT;
884 if (delalloc && (tlast > last_index))
885 tlast = last_index;
886 xfs_cluster_write(inode, page->index + 1, iomp, wbc,
887 startio, unmapped, tlast);
888 }
889
890 return page_dirty;
891
892error:
893 for (i = 0; i < cnt; i++) {
894 unlock_buffer(bh_arr[i]);
895 }
896
897 /*
898 * If it's delalloc and we have nowhere to put it,
899 * throw it away, unless the lower layers told
900 * us to try again.
901 */
902 if (err != -EAGAIN) {
903 if (!unmapped) {
904 block_invalidatepage(page, 0);
905 }
906 ClearPageUptodate(page);
907 }
908 return err;
909}
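/*
 * Example of the page_dirty accounting, assuming 4k pages and 512 byte
 * blocks: the count starts at PAGE_CACHE_SIZE / len = 4096 / 512 = 8,
 * one per buffer_head, and is decremented for each buffer moved to a
 * cleanable state.  A return value of 0 therefore tells
 * linvfs_release_page below that every buffer was handled and the
 * page may be freed.
 */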
910
911STATIC int
912__linvfs_get_block(
913 struct inode *inode,
914 sector_t iblock,
915 unsigned long blocks,
916 struct buffer_head *bh_result,
917 int create,
918 int direct,
919 bmapi_flags_t flags)
920{
921 vnode_t *vp = LINVFS_GET_VP(inode);
922 xfs_iomap_t iomap;
923 int retpbbm = 1;
924 int error;
925 ssize_t size;
926 loff_t offset = (loff_t)iblock << inode->i_blkbits;
927
928 if (blocks)
929 size = blocks << inode->i_blkbits;
930 else
931 size = 1 << inode->i_blkbits;
932
933 VOP_BMAP(vp, offset, size,
934 create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
935 if (error)
936 return -error;
937
938 if (retpbbm == 0)
939 return 0;
940
941 if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
942 xfs_daddr_t bn;
943 loff_t delta;
944
945 /* For unwritten extents do not report a disk address on
946 * the read case (treat as if we're reading into a hole).
947 */
948 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
949 delta = offset - iomap.iomap_offset;
950 delta >>= inode->i_blkbits;
951
952 bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT);
953 bn += delta;
954 BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME));
955 bh_result->b_blocknr = bn;
956 set_buffer_mapped(bh_result);
957 }
958 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
959 if (direct)
960 bh_result->b_private = inode;
961 set_buffer_unwritten(bh_result);
962 set_buffer_delay(bh_result);
963 }
964 }
965
966 /* If this is a realtime file, data might be on a new device */
967 bh_result->b_bdev = iomap.iomap_target->pbr_bdev;
968
969 /* If we previously allocated a block out beyond eof and
970 * we are now coming back to use it then we will need to
971 * flag it as new even if it has a disk address.
972 */
973 if (create &&
974 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
975 (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) {
976 set_buffer_new(bh_result);
977 }
978
979 if (iomap.iomap_flags & IOMAP_DELAY) {
980 BUG_ON(direct);
981 if (create) {
982 set_buffer_uptodate(bh_result);
983 set_buffer_mapped(bh_result);
984 set_buffer_delay(bh_result);
985 }
986 }
987
988 if (blocks) {
989 bh_result->b_size = (ssize_t)min(
990 (loff_t)(iomap.iomap_bsize - iomap.iomap_delta),
991 (loff_t)(blocks << inode->i_blkbits));
992 }
993
994 return 0;
995}
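/*
 * Worked example of the block number mapping above: iomap_bn is in
 * 512 byte basic blocks (BBSHIFT == 9).  Assuming 4k filesystem blocks
 * (i_blkbits == 12), iomap_bn = 80, iomap_offset = 0x4000 and
 * offset = 0x6000:
 *
 *	delta = (0x6000 - 0x4000) >> 12 = 2
 *	bn    = (80 >> 3) + 2 = 12
 *
 * so b_blocknr is expressed in filesystem blocks, not basic blocks.
 */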
996
997int
998linvfs_get_block(
999 struct inode *inode,
1000 sector_t iblock,
1001 struct buffer_head *bh_result,
1002 int create)
1003{
1004 return __linvfs_get_block(inode, iblock, 0, bh_result,
1005 create, 0, BMAPI_WRITE);
1006}
1007
1008STATIC int
1009linvfs_get_blocks_direct(
1010 struct inode *inode,
1011 sector_t iblock,
1012 unsigned long max_blocks,
1013 struct buffer_head *bh_result,
1014 int create)
1015{
1016 return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
1017 create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1018}
1019
1020STATIC ssize_t
1021linvfs_direct_IO(
1022 int rw,
1023 struct kiocb *iocb,
1024 const struct iovec *iov,
1025 loff_t offset,
1026 unsigned long nr_segs)
1027{
1028 struct file *file = iocb->ki_filp;
1029 struct inode *inode = file->f_mapping->host;
1030 vnode_t *vp = LINVFS_GET_VP(inode);
1031 xfs_iomap_t iomap;
1032 int maps = 1;
1033 int error;
1034
1035 VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
1036 if (error)
1037 return -error;
1038
1039 return blockdev_direct_IO_own_locking(rw, iocb, inode,
1040 iomap.iomap_target->pbr_bdev,
1041 iov, offset, nr_segs,
1042 linvfs_get_blocks_direct,
1043 linvfs_unwritten_convert_direct);
1044}
1045
1046
1047STATIC sector_t
1048linvfs_bmap(
1049 struct address_space *mapping,
1050 sector_t block)
1051{
1052 struct inode *inode = (struct inode *)mapping->host;
1053 vnode_t *vp = LINVFS_GET_VP(inode);
1054 int error;
1055
1056 vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
1057
1058 VOP_RWLOCK(vp, VRWLOCK_READ);
1059 VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
1060 VOP_RWUNLOCK(vp, VRWLOCK_READ);
1061 return generic_block_bmap(mapping, block, linvfs_get_block);
1062}
1063
1064STATIC int
1065linvfs_readpage(
1066 struct file *unused,
1067 struct page *page)
1068{
1069 return mpage_readpage(page, linvfs_get_block);
1070}
1071
1072STATIC int
1073linvfs_readpages(
1074 struct file *unused,
1075 struct address_space *mapping,
1076 struct list_head *pages,
1077 unsigned nr_pages)
1078{
1079 return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
1080}
1081
1082STATIC void
1083xfs_count_page_state(
1084 struct page *page,
1085 int *delalloc,
1086 int *unmapped,
1087 int *unwritten)
1088{
1089 struct buffer_head *bh, *head;
1090
1091 *delalloc = *unmapped = *unwritten = 0;
1092
1093 bh = head = page_buffers(page);
1094 do {
1095 if (buffer_uptodate(bh) && !buffer_mapped(bh))
1096 (*unmapped) = 1;
1097 else if (buffer_unwritten(bh) && !buffer_delay(bh))
1098 clear_buffer_unwritten(bh);
1099 else if (buffer_unwritten(bh))
1100 (*unwritten) = 1;
1101 else if (buffer_delay(bh))
1102 (*delalloc) = 1;
1103 } while ((bh = bh->b_this_page) != head);
1104}
1105
1106
1107/*
1108 * writepage: Called from one of two places:
1109 *
1110 * 1. we are flushing a delalloc buffer head.
1111 *
1112 * 2. we are writing out a dirty page. Typically the page dirty
1113 * state is cleared before we get here. In this case it is
1114 * conceivable we have no buffer heads.
1115 *
1116 * For delalloc space on the page we need to allocate space and
1117 * flush it. For unmapped buffer heads on the page we should
1118 * allocate space if the page is uptodate. For any other dirty
1119 * buffer heads on the page we should flush them.
1120 *
1121 * If we detect that a transaction would be required to flush
1122 * the page, we have to check the process flags first, if we
1123 * are already in a transaction or disk I/O during allocations
1124 * is off, we need to fail the writepage and redirty the page.
1125 */
1126
1127STATIC int
1128linvfs_writepage(
1129 struct page *page,
1130 struct writeback_control *wbc)
1131{
1132 int error;
1133 int need_trans;
1134 int delalloc, unmapped, unwritten;
1135 struct inode *inode = page->mapping->host;
1136
1137 xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
1138
1139 /*
1140 * We need a transaction if:
1141 * 1. There are delalloc buffers on the page
1142 * 2. The page is uptodate and we have unmapped buffers
1143 * 3. The page is uptodate and we have no buffers
1144 * 4. There are unwritten buffers on the page
1145 */
1146
1147 if (!page_has_buffers(page)) {
1148 unmapped = 1;
1149 need_trans = 1;
1150 } else {
1151 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1152 if (!PageUptodate(page))
1153 unmapped = 0;
1154 need_trans = delalloc + unmapped + unwritten;
1155 }
1156
1157 /*
1158 * If we need a transaction and the process flags say
1159 * we are already in a transaction, or no IO is allowed
1160 * then mark the page dirty again and leave the page
1161 * as is.
1162 */
1163 if (PFLAGS_TEST_FSTRANS() && need_trans)
1164 goto out_fail;
1165
1166 /*
1167 * Delay hooking up buffer heads until we have
1168 * made our go/no-go decision.
1169 */
1170 if (!page_has_buffers(page))
1171 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1172
1173 /*
1174 * Convert delayed allocate, unwritten or unmapped space
1175 * to real space and flush out to disk.
1176 */
1177 error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1178 if (error == -EAGAIN)
1179 goto out_fail;
1180 if (unlikely(error < 0))
1181 goto out_unlock;
1182
1183 return 0;
1184
1185out_fail:
1186 redirty_page_for_writepage(wbc, page);
1187 unlock_page(page);
1188 return 0;
1189out_unlock:
1190 unlock_page(page);
1191 return error;
1192}
1193
1194/*
1195 * Called to move a page into cleanable state - and from there
1196 * to be released. Possibly the page is already clean. We always
1197 * have buffer heads in this call.
1198 *
1199 * Returns 0 if the page is ok to release, 1 otherwise.
1200 *
1201 * Possible scenarios are:
1202 *
1203 * 1. We are being called to release a page which has been written
1204 * to via regular I/O.  Buffer heads will be dirty and possibly
1205 * delalloc.  If there are no delalloc buffer heads in this case
1206 * then we can just return zero.
1207 *
1208 * 2. We are called to release a page which has been written via
1209 * mmap; all we need to do is ensure there is no delalloc
1210 * state in the buffer heads.  If there is none we can let the
1211 * caller free them, and we should come back later via writepage.
1212 */
1213STATIC int
1214linvfs_release_page(
1215 struct page *page,
1216 int gfp_mask)
1217{
1218 struct inode *inode = page->mapping->host;
1219 int dirty, delalloc, unmapped, unwritten;
1220 struct writeback_control wbc = {
1221 .sync_mode = WB_SYNC_ALL,
1222 .nr_to_write = 1,
1223 };
1224
1225 xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
1226
1227 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1228 if (!delalloc && !unwritten)
1229 goto free_buffers;
1230
1231 if (!(gfp_mask & __GFP_FS))
1232 return 0;
1233
1234 /* If we are already inside a transaction or the thread cannot
1235 * do I/O, we cannot release this page.
1236 */
1237 if (PFLAGS_TEST_FSTRANS())
1238 return 0;
1239
1240 /*
1241 * Convert delalloc space to real space, do not flush the
1242 * data out to disk, that will be done by the caller.
1243 * Never need to allocate space here - we will always
1244 * come back to writepage in that case.
1245 */
1246 dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
1247 if (dirty == 0 && !unwritten)
1248 goto free_buffers;
1249 return 0;
1250
1251free_buffers:
1252 return try_to_free_buffers(page);
1253}
1254
1255STATIC int
1256linvfs_prepare_write(
1257 struct file *file,
1258 struct page *page,
1259 unsigned int from,
1260 unsigned int to)
1261{
1262 return block_prepare_write(page, from, to, linvfs_get_block);
1263}
1264
1265struct address_space_operations linvfs_aops = {
1266 .readpage = linvfs_readpage,
1267 .readpages = linvfs_readpages,
1268 .writepage = linvfs_writepage,
1269 .sync_page = block_sync_page,
1270 .releasepage = linvfs_release_page,
1271 .prepare_write = linvfs_prepare_write,
1272 .commit_write = generic_commit_write,
1273 .bmap = linvfs_bmap,
1274 .direct_IO = linvfs_direct_IO,
1275};
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
new file mode 100644
index 000000000000..23e0eb67fc25
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -0,0 +1,1980 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * The xfs_buf.c code provides an abstract buffer cache model on top
35 * of the Linux page cache. Cached metadata blocks for a file system
36 * are hashed to the inode for the block device. xfs_buf.c assembles
37 * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
38 *
39 * Written by Steve Lord, Jim Mostek, Russell Cattelan
40 * and Rajagopal Ananthanarayanan ("ananth") at SGI.
41 *
42 */
43
44#include <linux/stddef.h>
45#include <linux/errno.h>
46#include <linux/slab.h>
47#include <linux/pagemap.h>
48#include <linux/init.h>
49#include <linux/vmalloc.h>
50#include <linux/bio.h>
51#include <linux/sysctl.h>
52#include <linux/proc_fs.h>
53#include <linux/workqueue.h>
54#include <linux/percpu.h>
55#include <linux/blkdev.h>
56#include <linux/hash.h>
57
58#include "xfs_linux.h"
59
60/*
61 * File wide globals
62 */
63
64STATIC kmem_cache_t *pagebuf_cache;
65STATIC kmem_shaker_t pagebuf_shake;
66STATIC int pagebuf_daemon_wakeup(int, unsigned int);
67STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
68STATIC struct workqueue_struct *pagebuf_logio_workqueue;
69STATIC struct workqueue_struct *pagebuf_dataio_workqueue;
70
71/*
72 * Pagebuf debugging
73 */
74
75#ifdef PAGEBUF_TRACE
76void
77pagebuf_trace(
78 xfs_buf_t *pb,
79 char *id,
80 void *data,
81 void *ra)
82{
83 ktrace_enter(pagebuf_trace_buf,
84 pb, id,
85 (void *)(unsigned long)pb->pb_flags,
86 (void *)(unsigned long)pb->pb_hold.counter,
87 (void *)(unsigned long)pb->pb_sema.count.counter,
88 (void *)current,
89 data, ra,
90 (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
91 (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
92 (void *)(unsigned long)pb->pb_buffer_length,
93 NULL, NULL, NULL, NULL, NULL);
94}
95ktrace_t *pagebuf_trace_buf;
96#define PAGEBUF_TRACE_SIZE 4096
97#define PB_TRACE(pb, id, data) \
98 pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
99#else
100#define PB_TRACE(pb, id, data) do { } while (0)
101#endif
102
103#ifdef PAGEBUF_LOCK_TRACKING
104# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
105# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
106# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
107#else
108# define PB_SET_OWNER(pb) do { } while (0)
109# define PB_CLEAR_OWNER(pb) do { } while (0)
110# define PB_GET_OWNER(pb) do { } while (0)
111#endif
112
113/*
114 * Pagebuf allocation / freeing.
115 */
116
117#define pb_to_gfp(flags) \
118 ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
119 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
120
121#define pb_to_km(flags) \
122 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
123
124
125#define pagebuf_allocate(flags) \
126 kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
127#define pagebuf_deallocate(pb) \
128 kmem_zone_free(pagebuf_cache, (pb));
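/*
 * How the mappings above expand, case by case:
 *
 *	pb_to_gfp(PBF_READ_AHEAD)	__GFP_NORETRY | __GFP_NOWARN
 *	pb_to_gfp(PBF_DONT_BLOCK)	GFP_NOFS | __GFP_NOWARN
 *	pb_to_gfp(0)			GFP_KERNEL | __GFP_NOWARN
 *	pb_to_km(PBF_DONT_BLOCK)	KM_NOFS
 *	pb_to_km(0)			KM_SLEEP
 *
 * Readahead allocations are allowed to fail quietly; PBF_DONT_BLOCK
 * avoids recursing into the filesystem from the allocator while
 * locks are held.
 */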
129
130/*
131 * Page Region interfaces.
132 *
133 * For pages in filesystems where the blocksize is smaller than the
134 * pagesize, we use the page->private field (long) to hold a bitmap
135 * of uptodate regions within the page.
136 *
137 * Each such region is "bytes per page / bits per long" bytes long.
138 *
139 * NBPPR == number-of-bytes-per-page-region
140 * BTOPR == bytes-to-page-region (rounded up)
141 * BTOPRT == bytes-to-page-region-truncated (rounded down)
142 */
143#if (BITS_PER_LONG == 32)
144#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
145#elif (BITS_PER_LONG == 64)
146#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
147#else
148#error BITS_PER_LONG must be 32 or 64
149#endif
150#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
151#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
152#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
153
154STATIC unsigned long
155page_region_mask(
156 size_t offset,
157 size_t length)
158{
159 unsigned long mask;
160 int first, final;
161
162 first = BTOPR(offset);
163 final = BTOPRT(offset + length - 1);
164 first = min(first, final);
165
166 mask = ~0UL;
167 mask <<= BITS_PER_LONG - (final - first + 1);
168 mask >>= BITS_PER_LONG - (final + 1);
169
170 ASSERT(offset + length <= PAGE_CACHE_SIZE);
171 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
172
173 return mask;
174}
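/*
 * Worked example, assuming 4k pages and 64 bit longs (64 regions of
 * 64 bytes each): for offset = 512, length = 1024,
 *
 *	first = BTOPR(512)   = (512 + 63) >> 6 = 8
 *	final = BTOPRT(1535) = 1535 >> 6       = 23
 *
 * and the returned mask has bits 8..23 set.  A full page (offset 0,
 * length 4096) yields ~0UL, which is what set_page_region below
 * compares against before calling SetPageUptodate.
 */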
175
176STATIC inline void
177set_page_region(
178 struct page *page,
179 size_t offset,
180 size_t length)
181{
182 page->private |= page_region_mask(offset, length);
183 if (page->private == ~0UL)
184 SetPageUptodate(page);
185}
186
187STATIC inline int
188test_page_region(
189 struct page *page,
190 size_t offset,
191 size_t length)
192{
193 unsigned long mask = page_region_mask(offset, length);
194
195 return (mask && (page->private & mask) == mask);
196}
197
198/*
199 * Mapping of multi-page buffers into contiguous virtual space
200 */
201
202typedef struct a_list {
203 void *vm_addr;
204 struct a_list *next;
205} a_list_t;
206
207STATIC a_list_t *as_free_head;
208STATIC int as_list_len;
209STATIC DEFINE_SPINLOCK(as_lock);
210
211/*
212 * Try to batch vunmaps because they are costly.
213 */
214STATIC void
215free_address(
216 void *addr)
217{
218 a_list_t *aentry;
219
220 aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
221 if (likely(aentry)) {
222 spin_lock(&as_lock);
223 aentry->next = as_free_head;
224 aentry->vm_addr = addr;
225 as_free_head = aentry;
226 as_list_len++;
227 spin_unlock(&as_lock);
228 } else {
229 vunmap(addr);
230 }
231}
232
233STATIC void
234purge_addresses(void)
235{
236 a_list_t *aentry, *old;
237
238 if (as_free_head == NULL)
239 return;
240
241 spin_lock(&as_lock);
242 aentry = as_free_head;
243 as_free_head = NULL;
244 as_list_len = 0;
245 spin_unlock(&as_lock);
246
247 while ((old = aentry) != NULL) {
248 vunmap(aentry->vm_addr);
249 aentry = aentry->next;
250 kfree(old);
251 }
252}
253
254/*
255 * Internal pagebuf object manipulation
256 */
257
258STATIC void
259_pagebuf_initialize(
260 xfs_buf_t *pb,
261 xfs_buftarg_t *target,
262 loff_t range_base,
263 size_t range_length,
264 page_buf_flags_t flags)
265{
266 /*
267 * We don't want certain flags to appear in pb->pb_flags.
268 */
269 flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
270
271 memset(pb, 0, sizeof(xfs_buf_t));
272 atomic_set(&pb->pb_hold, 1);
273 init_MUTEX_LOCKED(&pb->pb_iodonesema);
274 INIT_LIST_HEAD(&pb->pb_list);
275 INIT_LIST_HEAD(&pb->pb_hash_list);
276 init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
277 PB_SET_OWNER(pb);
278 pb->pb_target = target;
279 pb->pb_file_offset = range_base;
280 /*
281 * Set buffer_length and count_desired to the same value initially.
282 * I/O routines should use count_desired, which will be the same in
283 * most cases but may be reset (e.g. XFS recovery).
284 */
285 pb->pb_buffer_length = pb->pb_count_desired = range_length;
286 pb->pb_flags = flags | PBF_NONE;
287 pb->pb_bn = XFS_BUF_DADDR_NULL;
288 atomic_set(&pb->pb_pin_count, 0);
289 init_waitqueue_head(&pb->pb_waiters);
290
291 XFS_STATS_INC(pb_create);
292 PB_TRACE(pb, "initialize", target);
293}
294
295/*
296 * Allocate a page array capable of holding a specified number
297 * of pages, and point the page buf at it.
298 */
299STATIC int
300_pagebuf_get_pages(
301 xfs_buf_t *pb,
302 int page_count,
303 page_buf_flags_t flags)
304{
305 /* Make sure that we have a page list */
306 if (pb->pb_pages == NULL) {
307 pb->pb_offset = page_buf_poff(pb->pb_file_offset);
308 pb->pb_page_count = page_count;
309 if (page_count <= PB_PAGES) {
310 pb->pb_pages = pb->pb_page_array;
311 } else {
312 pb->pb_pages = kmem_alloc(sizeof(struct page *) *
313 page_count, pb_to_km(flags));
314 if (pb->pb_pages == NULL)
315 return -ENOMEM;
316 }
317 memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
318 }
319 return 0;
320}
321
322/*
323 * Frees pb_pages if it was malloced.
324 */
325STATIC void
326_pagebuf_free_pages(
327 xfs_buf_t *bp)
328{
329 if (bp->pb_pages != bp->pb_page_array) {
330 kmem_free(bp->pb_pages,
331 bp->pb_page_count * sizeof(struct page *));
332 }
333}
334
335/*
336 * Releases the specified buffer.
337 *
338 * The modification state of any associated pages is left unchanged.
339 * The buffer must not be on any hash - use pagebuf_rele instead for
340 * hashed and refcounted buffers.
341 */
342void
343pagebuf_free(
344 xfs_buf_t *bp)
345{
346 PB_TRACE(bp, "free", 0);
347
348 ASSERT(list_empty(&bp->pb_hash_list));
349
350 if (bp->pb_flags & _PBF_PAGE_CACHE) {
351 uint i;
352
353 if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
354 free_address(bp->pb_addr - bp->pb_offset);
355
356 for (i = 0; i < bp->pb_page_count; i++)
357 page_cache_release(bp->pb_pages[i]);
358 _pagebuf_free_pages(bp);
359 } else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
360 /*
361 * XXX(hch): bp->pb_count_desired might be incorrect (see
362 * pagebuf_associate_memory for details), but fortunately
363 * the Linux version of kmem_free ignores the len argument..
364 */
365 kmem_free(bp->pb_addr, bp->pb_count_desired);
366 _pagebuf_free_pages(bp);
367 }
368
369 pagebuf_deallocate(bp);
370}
371
372/*
373 * Finds all pages for buffer in question and builds its page list.
374 */
375STATIC int
376_pagebuf_lookup_pages(
377 xfs_buf_t *bp,
378 uint flags)
379{
380 struct address_space *mapping = bp->pb_target->pbr_mapping;
381 size_t blocksize = bp->pb_target->pbr_bsize;
382 size_t size = bp->pb_count_desired;
383 size_t nbytes, offset;
384 int gfp_mask = pb_to_gfp(flags);
385 unsigned short page_count, i;
386 pgoff_t first;
387 loff_t end;
388 int error;
389
390 end = bp->pb_file_offset + bp->pb_buffer_length;
391 page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
392
393 error = _pagebuf_get_pages(bp, page_count, flags);
394 if (unlikely(error))
395 return error;
396 bp->pb_flags |= _PBF_PAGE_CACHE;
397
398 offset = bp->pb_offset;
399 first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
400
401 for (i = 0; i < bp->pb_page_count; i++) {
402 struct page *page;
403 uint retries = 0;
404
405 retry:
406 page = find_or_create_page(mapping, first + i, gfp_mask);
407 if (unlikely(page == NULL)) {
408 if (flags & PBF_READ_AHEAD) {
409 bp->pb_page_count = i;
410 for (i = 0; i < bp->pb_page_count; i++)
411 unlock_page(bp->pb_pages[i]);
412 return -ENOMEM;
413 }
414
415 /*
416 * This could deadlock.
417 *
418 * But until all the XFS lowlevel code is revamped to
419 * handle buffer allocation failures we can't do much.
420 */
421 if (!(++retries % 100))
422 printk(KERN_ERR
423 "XFS: possible memory allocation "
424 "deadlock in %s (mode:0x%x)\n",
425 __FUNCTION__, gfp_mask);
426
427 XFS_STATS_INC(pb_page_retries);
428 pagebuf_daemon_wakeup(0, gfp_mask);
429 blk_congestion_wait(WRITE, HZ/50);
430 goto retry;
431 }
432
433 XFS_STATS_INC(pb_page_found);
434
435 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
436 size -= nbytes;
437
438 if (!PageUptodate(page)) {
439 page_count--;
440 if (blocksize >= PAGE_CACHE_SIZE) {
441 if (flags & PBF_READ)
442 bp->pb_locked = 1;
443 } else if (!PagePrivate(page)) {
444 if (test_page_region(page, offset, nbytes))
445 page_count++;
446 }
447 }
448
449 bp->pb_pages[i] = page;
450 offset = 0;
451 }
452
453 if (!bp->pb_locked) {
454 for (i = 0; i < bp->pb_page_count; i++)
455 unlock_page(bp->pb_pages[i]);
456 }
457
458 if (page_count) {
459 /* if we have any uptodate pages, mark that in the buffer */
460 bp->pb_flags &= ~PBF_NONE;
461
462 /* if some pages aren't uptodate, mark that in the buffer */
463 if (page_count != bp->pb_page_count)
464 bp->pb_flags |= PBF_PARTIAL;
465 }
466
467 PB_TRACE(bp, "lookup_pages", (long)page_count);
468 return error;
469}
470
471/*
472 * Map buffer into kernel address-space if necessary.
473 */
474STATIC int
475_pagebuf_map_pages(
476 xfs_buf_t *bp,
477 uint flags)
478{
479 /* A single page buffer is always mappable */
480 if (bp->pb_page_count == 1) {
481 bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
482 bp->pb_flags |= PBF_MAPPED;
483 } else if (flags & PBF_MAPPED) {
484 if (as_list_len > 64)
485 purge_addresses();
486 bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
487 VM_MAP, PAGE_KERNEL);
488 if (unlikely(bp->pb_addr == NULL))
489 return -ENOMEM;
490 bp->pb_addr += bp->pb_offset;
491 bp->pb_flags |= PBF_MAPPED;
492 }
493
494 return 0;
495}
496
497/*
498 * Finding and Reading Buffers
499 */
500
501/*
502 * _pagebuf_find
503 *
504 * Looks up, and creates if absent, a lockable buffer for
505 * a given range of an inode. The buffer is returned
506 * locked. If other overlapping buffers exist, they are
507 * released before the new buffer is created and locked,
508 * which may imply that this call will block until those buffers
509 * are unlocked. No I/O is implied by this call.
510 */
511xfs_buf_t *
512_pagebuf_find(
513 xfs_buftarg_t *btp, /* block device target */
514 loff_t ioff, /* starting offset of range */
515 size_t isize, /* length of range */
516 page_buf_flags_t flags, /* PBF_TRYLOCK */
517 xfs_buf_t *new_pb)/* newly allocated buffer */
518{
519 loff_t range_base;
520 size_t range_length;
521 xfs_bufhash_t *hash;
522 xfs_buf_t *pb, *n;
523
524 range_base = (ioff << BBSHIFT);
525 range_length = (isize << BBSHIFT);
526
527 /* Check for IOs smaller than the sector size / not sector aligned */
528 ASSERT(!(range_length < (1 << btp->pbr_sshift)));
529 ASSERT(!(range_base & (loff_t)btp->pbr_smask));
530
531 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
532
533 spin_lock(&hash->bh_lock);
534
535 list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
536 ASSERT(btp == pb->pb_target);
537 if (pb->pb_file_offset == range_base &&
538 pb->pb_buffer_length == range_length) {
539 /*
540 * If we look at something, bring it to the
541 * front of the list for next time.
542 */
543 atomic_inc(&pb->pb_hold);
544 list_move(&pb->pb_hash_list, &hash->bh_list);
545 goto found;
546 }
547 }
548
549 /* No match found */
550 if (new_pb) {
551 _pagebuf_initialize(new_pb, btp, range_base,
552 range_length, flags);
553 new_pb->pb_hash = hash;
554 list_add(&new_pb->pb_hash_list, &hash->bh_list);
555 } else {
556 XFS_STATS_INC(pb_miss_locked);
557 }
558
559 spin_unlock(&hash->bh_lock);
560 return new_pb;
561
562found:
563 spin_unlock(&hash->bh_lock);
564
565 /* Attempt to get the semaphore without sleeping,
566 * if this does not work then we need to drop the
567 * spinlock and do a hard attempt on the semaphore.
568 */
569 if (down_trylock(&pb->pb_sema)) {
570 if (!(flags & PBF_TRYLOCK)) {
571 /* wait for buffer ownership */
572 PB_TRACE(pb, "get_lock", 0);
573 pagebuf_lock(pb);
574 XFS_STATS_INC(pb_get_locked_waited);
575 } else {
576 /* We asked for a trylock and failed; no need
577 * to look at file offset and length here - we
578 * know that this pagebuf at least overlaps our
579 * pagebuf and is locked, therefore our buffer
580 * either does not exist or is this buffer.
581 */
582
583 pagebuf_rele(pb);
584 XFS_STATS_INC(pb_busy_locked);
585 return (NULL);
586 }
587 } else {
588 /* trylock worked */
589 PB_SET_OWNER(pb);
590 }
591
592 if (pb->pb_flags & PBF_STALE)
593 pb->pb_flags &= PBF_MAPPED;
594 PB_TRACE(pb, "got_lock", 0);
595 XFS_STATS_INC(pb_get_locked);
596 return (pb);
597}
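/*
 * Bucket selection example: for the data device the table (see
 * xfs_alloc_bufhash below) has 256 buckets, so a buffer starting at
 * basic block ioff hashes to
 *
 *	btp->bt_hash[hash_long((unsigned long)ioff, 8)]
 *
 * and only that bucket's spinlock is taken during the lookup, keeping
 * contention between concurrent lookups low.
 */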
598
599/*
600 * xfs_buf_get_flags assembles a buffer covering the specified range.
601 *
602 * Storage in memory for all portions of the buffer will be allocated,
603 * although backing storage may not be.
604 */
605xfs_buf_t *
606xfs_buf_get_flags( /* allocate a buffer */
607 xfs_buftarg_t *target,/* target for buffer */
608 loff_t ioff, /* starting offset of range */
609 size_t isize, /* length of range */
610 page_buf_flags_t flags) /* PBF_TRYLOCK */
611{
612 xfs_buf_t *pb, *new_pb;
613 int error = 0, i;
614
615 new_pb = pagebuf_allocate(flags);
616 if (unlikely(!new_pb))
617 return NULL;
618
619 pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
620 if (pb == new_pb) {
621 error = _pagebuf_lookup_pages(pb, flags);
622 if (error)
623 goto no_buffer;
624 } else {
625 pagebuf_deallocate(new_pb);
626 if (unlikely(pb == NULL))
627 return NULL;
628 }
629
630 for (i = 0; i < pb->pb_page_count; i++)
631 mark_page_accessed(pb->pb_pages[i]);
632
633 if (!(pb->pb_flags & PBF_MAPPED)) {
634 error = _pagebuf_map_pages(pb, flags);
635 if (unlikely(error)) {
636 printk(KERN_WARNING "%s: failed to map pages\n",
637 __FUNCTION__);
638 goto no_buffer;
639 }
640 }
641
642 XFS_STATS_INC(pb_get);
643
644 /*
645 * Always fill in the block number now, the mapped cases can do
646 * their own overlay of this later.
647 */
648 pb->pb_bn = ioff;
649 pb->pb_count_desired = pb->pb_buffer_length;
650
651 PB_TRACE(pb, "get", (unsigned long)flags);
652 return pb;
653
654 no_buffer:
655 if (flags & (PBF_LOCK | PBF_TRYLOCK))
656 pagebuf_unlock(pb);
657 pagebuf_rele(pb);
658 return NULL;
659}
660
661xfs_buf_t *
662xfs_buf_read_flags(
663 xfs_buftarg_t *target,
664 loff_t ioff,
665 size_t isize,
666 page_buf_flags_t flags)
667{
668 xfs_buf_t *pb;
669
670 flags |= PBF_READ;
671
672 pb = xfs_buf_get_flags(target, ioff, isize, flags);
673 if (pb) {
674 if (PBF_NOT_DONE(pb)) {
675 PB_TRACE(pb, "read", (unsigned long)flags);
676 XFS_STATS_INC(pb_get_read);
677 pagebuf_iostart(pb, flags);
678 } else if (flags & PBF_ASYNC) {
679 PB_TRACE(pb, "read_async", (unsigned long)flags);
680 /*
681 * Read ahead call which is already satisfied,
682 * drop the buffer
683 */
684 goto no_buffer;
685 } else {
686 PB_TRACE(pb, "read_done", (unsigned long)flags);
687 /* We do not want read in the flags */
688 pb->pb_flags &= ~PBF_READ;
689 }
690 }
691
692 return pb;
693
694 no_buffer:
695 if (flags & (PBF_LOCK | PBF_TRYLOCK))
696 pagebuf_unlock(pb);
697 pagebuf_rele(pb);
698 return NULL;
699}
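/*
 * Typical synchronous read usage, sketched with made-up offset and
 * length values:
 *
 *	bp = xfs_buf_read_flags(target, ioff, isize, PBF_LOCK);
 *	if (bp) {
 *		if (!bp->pb_error)
 *			... data is at pagebuf_offset(bp, 0) ...
 *		pagebuf_unlock(bp);
 *		pagebuf_rele(bp);
 *	}
 *
 * Without PBF_ASYNC the read completes before the call returns.
 */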
700
701/*
702 * Create a skeletal pagebuf (no pages associated with it).
703 */
704xfs_buf_t *
705pagebuf_lookup(
706 xfs_buftarg_t *target,
707 loff_t ioff,
708 size_t isize,
709 page_buf_flags_t flags)
710{
711 xfs_buf_t *pb;
712
713 pb = pagebuf_allocate(flags);
714 if (pb) {
715 _pagebuf_initialize(pb, target, ioff, isize, flags);
716 }
717 return pb;
718}
719
720/*
721 * If we are not low on memory then do the readahead in a deadlock
722 * safe manner.
723 */
724void
725pagebuf_readahead(
726 xfs_buftarg_t *target,
727 loff_t ioff,
728 size_t isize,
729 page_buf_flags_t flags)
730{
731 struct backing_dev_info *bdi;
732
733 bdi = target->pbr_mapping->backing_dev_info;
734 if (bdi_read_congested(bdi))
735 return;
736
737 flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
738 xfs_buf_read_flags(target, ioff, isize, flags);
739}
740
741xfs_buf_t *
742pagebuf_get_empty(
743 size_t len,
744 xfs_buftarg_t *target)
745{
746 xfs_buf_t *pb;
747
748 pb = pagebuf_allocate(0);
749 if (pb)
750 _pagebuf_initialize(pb, target, 0, len, 0);
751 return pb;
752}
753
754static inline struct page *
755mem_to_page(
756 void *addr)
757{
758 if (((unsigned long)addr < VMALLOC_START) ||
759 ((unsigned long)addr >= VMALLOC_END)) {
760 return virt_to_page(addr);
761 } else {
762 return vmalloc_to_page(addr);
763 }
764}
765
766int
767pagebuf_associate_memory(
768 xfs_buf_t *pb,
769 void *mem,
770 size_t len)
771{
772 int rval;
773 int i = 0;
774 size_t ptr;
775 size_t end, end_cur;
776 off_t offset;
777 int page_count;
778
779 page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
780 offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
781 if (offset && (len > PAGE_CACHE_SIZE))
782 page_count++;
783
784 /* Free any previous set of page pointers */
785 if (pb->pb_pages)
786 _pagebuf_free_pages(pb);
787
788 pb->pb_pages = NULL;
789 pb->pb_addr = mem;
790
791 rval = _pagebuf_get_pages(pb, page_count, 0);
792 if (rval)
793 return rval;
794
795 pb->pb_offset = offset;
796 ptr = (size_t) mem & PAGE_CACHE_MASK;
797 end = PAGE_CACHE_ALIGN((size_t) mem + len);
798 end_cur = end;
799 /* set up first page */
800 pb->pb_pages[0] = mem_to_page(mem);
801
802 ptr += PAGE_CACHE_SIZE;
803 pb->pb_page_count = ++i;
804 while (ptr < end) {
805 pb->pb_pages[i] = mem_to_page((void *)ptr);
806 pb->pb_page_count = ++i;
807 ptr += PAGE_CACHE_SIZE;
808 }
809 pb->pb_locked = 0;
810
811 pb->pb_count_desired = pb->pb_buffer_length = len;
812 pb->pb_flags |= PBF_MAPPED;
813
814 return 0;
815}
816
817xfs_buf_t *
818pagebuf_get_no_daddr(
819 size_t len,
820 xfs_buftarg_t *target)
821{
822 size_t malloc_len = len;
823 xfs_buf_t *bp;
824 void *data;
825 int error;
826
827 bp = pagebuf_allocate(0);
828 if (unlikely(bp == NULL))
829 goto fail;
830 _pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
831
832 try_again:
833 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
834 if (unlikely(data == NULL))
835 goto fail_free_buf;
836
837 /* check whether alignment matches.. */
838 if ((__psunsigned_t)data !=
839 ((__psunsigned_t)data & ~target->pbr_smask)) {
840 /* .. else double the size and try again */
841 kmem_free(data, malloc_len);
842 malloc_len <<= 1;
843 goto try_again;
844 }
845
846 error = pagebuf_associate_memory(bp, data, len);
847 if (error)
848 goto fail_free_mem;
849 bp->pb_flags |= _PBF_KMEM_ALLOC;
850
851 pagebuf_unlock(bp);
852
853 PB_TRACE(bp, "no_daddr", data);
854 return bp;
855 fail_free_mem:
856 kmem_free(data, malloc_len);
857 fail_free_buf:
858 pagebuf_free(bp);
859 fail:
860 return NULL;
861}
862
863/*
864 * pagebuf_hold
865 *
866 * Increment reference count on buffer, to hold the buffer concurrently
867 * with another thread which may release (free) the buffer asynchronously.
868 *
869 * Must hold the buffer already to call this function.
870 */
871void
872pagebuf_hold(
873 xfs_buf_t *pb)
874{
875 atomic_inc(&pb->pb_hold);
876 PB_TRACE(pb, "hold", 0);
877}
878
879/*
880 * pagebuf_rele
881 *
882 * pagebuf_rele releases a hold on the specified buffer.  If the
883 * hold count is 1, pagebuf_rele calls pagebuf_free.
884 */
885void
886pagebuf_rele(
887 xfs_buf_t *pb)
888{
889 xfs_bufhash_t *hash = pb->pb_hash;
890
891 PB_TRACE(pb, "rele", pb->pb_relse);
892
893 /*
894 * pagebuf_lookup buffers are not hashed, not delayed write,
895 * and don't have their own release routines. Special case.
896 */
897 if (unlikely(!hash)) {
898 ASSERT(!pb->pb_relse);
899 if (atomic_dec_and_test(&pb->pb_hold))
900 xfs_buf_free(pb);
901 return;
902 }
903
904 if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
905 int do_free = 1;
906
907 if (pb->pb_relse) {
908 atomic_inc(&pb->pb_hold);
909 spin_unlock(&hash->bh_lock);
910 (*(pb->pb_relse)) (pb);
911 spin_lock(&hash->bh_lock);
912 do_free = 0;
913 }
914
915 if (pb->pb_flags & PBF_DELWRI) {
916 pb->pb_flags |= PBF_ASYNC;
917 atomic_inc(&pb->pb_hold);
918 pagebuf_delwri_queue(pb, 0);
919 do_free = 0;
920 } else if (pb->pb_flags & PBF_FS_MANAGED) {
921 do_free = 0;
922 }
923
924 if (do_free) {
925 list_del_init(&pb->pb_hash_list);
926 spin_unlock(&hash->bh_lock);
927 pagebuf_free(pb);
928 } else {
929 spin_unlock(&hash->bh_lock);
930 }
931 }
932}
933
934
935/*
936 * Mutual exclusion on buffers. Locking model:
937 *
938 * Buffers associated with inodes for which buffer locking
939 * is not enabled are not protected by semaphores, and are
940 * assumed to be exclusively owned by the caller. There is a
941 * spinlock in the buffer, used by the caller when concurrent
942 * access is possible.
943 */
944
945/*
946 * pagebuf_cond_lock
947 *
948 * pagebuf_cond_lock locks a buffer object, if it is not already locked.
949 * Note that this in no way
950 * locks the underlying pages, so it is only useful for synchronizing
951 * concurrent use of page buffer objects, not for synchronizing independent
952 * access to the underlying pages.
953 */
954int
955pagebuf_cond_lock( /* lock buffer, if not locked */
956 /* returns -EBUSY if locked */
957 xfs_buf_t *pb)
958{
959 int locked;
960
961 locked = down_trylock(&pb->pb_sema) == 0;
962 if (locked) {
963 PB_SET_OWNER(pb);
964 }
965 PB_TRACE(pb, "cond_lock", (long)locked);
966 return(locked ? 0 : -EBUSY);
967}
968
969#if defined(DEBUG) || defined(XFS_BLI_TRACE)
970/*
971 * pagebuf_lock_value
972 *
973 * Return lock value for a pagebuf
974 */
975int
976pagebuf_lock_value(
977 xfs_buf_t *pb)
978{
979 return(atomic_read(&pb->pb_sema.count));
980}
981#endif
982
983/*
984 * pagebuf_lock
985 *
986 * pagebuf_lock locks a buffer object. Note that this in no way
987 * locks the underlying pages, so it is only useful for synchronizing
988 * concurrent use of page buffer objects, not for synchronizing independent
989 * access to the underlying pages.
990 */
991int
992pagebuf_lock(
993 xfs_buf_t *pb)
994{
995 PB_TRACE(pb, "lock", 0);
996 if (atomic_read(&pb->pb_io_remaining))
997 blk_run_address_space(pb->pb_target->pbr_mapping);
998 down(&pb->pb_sema);
999 PB_SET_OWNER(pb);
1000 PB_TRACE(pb, "locked", 0);
1001 return 0;
1002}
1003
1004/*
1005 * pagebuf_unlock
1006 *
1007 * pagebuf_unlock releases the lock on the buffer object created by
1008 * pagebuf_lock or pagebuf_cond_lock (not any
1009 * pinning of underlying pages created by pagebuf_pin).
1010 */
1011void
1012pagebuf_unlock( /* unlock buffer */
1013 xfs_buf_t *pb) /* buffer to unlock */
1014{
1015 PB_CLEAR_OWNER(pb);
1016 up(&pb->pb_sema);
1017 PB_TRACE(pb, "unlock", 0);
1018}
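/*
 * Usage sketch: the non-blocking and blocking paths pair with
 * pagebuf_unlock as follows.
 *
 *	if (pagebuf_cond_lock(pb) == 0) {
 *		... got it without sleeping ...
 *		pagebuf_unlock(pb);
 *	} else {
 *		pagebuf_lock(pb);	sleeps until available
 *		...
 *		pagebuf_unlock(pb);
 *	}
 */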
1019
1020
1021/*
1022 * Pinning Buffer Storage in Memory
1023 */
1024
1025/*
1026 * pagebuf_pin
1027 *
1028 * pagebuf_pin locks all of the memory represented by a buffer in
1029 * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
1030 * the same or different buffers affecting a given page, will
1031 * properly count the number of outstanding "pin" requests. The
1032 * buffer may be released after the pagebuf_pin and a different
1033 * buffer used when calling pagebuf_unpin, if desired.
1034 * pagebuf_pin should be used by the file system when it wants to be
1035 * assured that no attempt will be made to force the affected
1036 * memory to disk. It does not assure that a given logical page
1037 * will not be moved to a different physical page.
1038 */
1039void
1040pagebuf_pin(
1041 xfs_buf_t *pb)
1042{
1043 atomic_inc(&pb->pb_pin_count);
1044 PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
1045}
1046
1047/*
1048 * pagebuf_unpin
1049 *
1050 * pagebuf_unpin reverses the locking of memory performed by
1051 * pagebuf_pin.  Note that both functions affect the logical
1052 * pages associated with the buffer, not the buffer itself.
1053 */
1054void
1055pagebuf_unpin(
1056 xfs_buf_t *pb)
1057{
1058 if (atomic_dec_and_test(&pb->pb_pin_count)) {
1059 wake_up_all(&pb->pb_waiters);
1060 }
1061 PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
1062}
1063
1064int
1065pagebuf_ispin(
1066 xfs_buf_t *pb)
1067{
1068 return atomic_read(&pb->pb_pin_count);
1069}
1070
1071/*
1072 * pagebuf_wait_unpin
1073 *
1074 * pagebuf_wait_unpin waits until all of the memory associated
1075 * with the buffer is no longer locked in memory.  It returns
1076 * immediately if none of the affected pages are locked.
1077 */
1078static inline void
1079_pagebuf_wait_unpin(
1080 xfs_buf_t *pb)
1081{
1082 DECLARE_WAITQUEUE (wait, current);
1083
1084 if (atomic_read(&pb->pb_pin_count) == 0)
1085 return;
1086
1087 add_wait_queue(&pb->pb_waiters, &wait);
1088 for (;;) {
1089 set_current_state(TASK_UNINTERRUPTIBLE);
1090 if (atomic_read(&pb->pb_pin_count) == 0)
1091 break;
1092 if (atomic_read(&pb->pb_io_remaining))
1093 blk_run_address_space(pb->pb_target->pbr_mapping);
1094 schedule();
1095 }
1096 remove_wait_queue(&pb->pb_waiters, &wait);
1097 set_current_state(TASK_RUNNING);
1098}
1099
1100/*
1101 * Buffer Utility Routines
1102 */
1103
1104/*
1105 * pagebuf_iodone
1106 *
1107 * pagebuf_iodone marks a buffer for which I/O is in progress
1108 * done with respect to that I/O. The pb_iodone routine, if
1109 * present, will be called as a side-effect.
1110 */
1111STATIC void
1112pagebuf_iodone_work(
1113 void *v)
1114{
1115 xfs_buf_t *bp = (xfs_buf_t *)v;
1116
1117 if (bp->pb_iodone)
1118 (*(bp->pb_iodone))(bp);
1119 else if (bp->pb_flags & PBF_ASYNC)
1120 xfs_buf_relse(bp);
1121}
1122
1123void
1124pagebuf_iodone(
1125 xfs_buf_t *pb,
1126 int dataio,
1127 int schedule)
1128{
1129 pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
1130 if (pb->pb_error == 0) {
1131 pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
1132 }
1133
1134 PB_TRACE(pb, "iodone", pb->pb_iodone);
1135
1136 if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
1137 if (schedule) {
1138 INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
1139 queue_work(dataio ? pagebuf_dataio_workqueue :
1140 pagebuf_logio_workqueue, &pb->pb_iodone_work);
1141 } else {
1142 pagebuf_iodone_work(pb);
1143 }
1144 } else {
1145 up(&pb->pb_iodonesema);
1146 }
1147}
1148
1149/*
1150 * pagebuf_ioerror
1151 *
1152 * pagebuf_ioerror sets the error code for a buffer.
1153 */
1154void
1155pagebuf_ioerror( /* mark/clear buffer error flag */
1156 xfs_buf_t *pb, /* buffer to mark */
1157 int error) /* error to store (0 if none) */
1158{
1159 ASSERT(error >= 0 && error <= 0xffff);
1160 pb->pb_error = (unsigned short)error;
1161 PB_TRACE(pb, "ioerror", (unsigned long)error);
1162}
1163
1164/*
1165 * pagebuf_iostart
1166 *
1167 * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
1168 * If necessary, it will arrange for any disk space allocation required,
1169 * and it will break up the request if the block mappings require it.
1170 * The pb_iodone routine in the buffer supplied will only be called
1171 * when all of the subsidiary I/O requests, if any, have been completed.
1172 * pagebuf_iostart calls the pagebuf_ioinitiate routine or
1173 * pagebuf_iorequest, if the former routine is not defined, to start
1174 * the I/O on a given low-level request.
1175 */
1176int
1177pagebuf_iostart( /* start I/O on a buffer */
1178 xfs_buf_t *pb, /* buffer to start */
1179 page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
1180 /* PBF_WRITE, PBF_DELWRI, */
1181 /* PBF_DONT_BLOCK */
1182{
1183 int status = 0;
1184
1185 PB_TRACE(pb, "iostart", (unsigned long)flags);
1186
1187 if (flags & PBF_DELWRI) {
1188 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
1189 pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
1190 pagebuf_delwri_queue(pb, 1);
1191 return status;
1192 }
1193
1194 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
1195 PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1196 pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
1197 PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1198
1199 BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
1200
1201 /* For writes allow an alternate strategy routine to precede
1202 * the actual I/O request (which may not be issued at all in
1203 * a shutdown situation, for example).
1204 */
1205 status = (flags & PBF_WRITE) ?
1206 pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
1207
1208 /* Wait for I/O if we are not an async request.
1209 * Note: async I/O request completion will release the buffer,
1210 * and that can already be done by this point. So using the
1211 * buffer pointer from here on, after async I/O, is invalid.
1212 */
1213 if (!status && !(flags & PBF_ASYNC))
1214 status = pagebuf_iowait(pb);
1215
1216 return status;
1217}
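/*
 * Usage sketch: a synchronous write waits for completion inside
 * pagebuf_iostart, an asynchronous one does not.
 *
 *	error = pagebuf_iostart(pb, PBF_WRITE);
 *		returns the I/O result once the write is done
 *
 *	error = pagebuf_iostart(pb, PBF_WRITE | PBF_ASYNC);
 *		returns immediately; the iodone path releases the
 *		buffer, so pb must not be touched afterwards
 */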
1218
1219/*
1220 * Helper routine for pagebuf_iorequest
1221 */
1222
1223STATIC __inline__ int
1224_pagebuf_iolocked(
1225 xfs_buf_t *pb)
1226{
1227 ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1228 if (pb->pb_flags & PBF_READ)
1229 return pb->pb_locked;
1230 return 0;
1231}
1232
1233STATIC __inline__ void
1234_pagebuf_iodone(
1235 xfs_buf_t *pb,
1236 int schedule)
1237{
1238 if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
1239 pb->pb_locked = 0;
1240 pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
1241 }
1242}
1243
1244STATIC int
1245bio_end_io_pagebuf(
1246 struct bio *bio,
1247 unsigned int bytes_done,
1248 int error)
1249{
1250 xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private;
1251 unsigned int i, blocksize = pb->pb_target->pbr_bsize;
1252 struct bio_vec *bvec = bio->bi_io_vec;
1253
1254 if (bio->bi_size)
1255 return 1;
1256
1257 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1258 pb->pb_error = EIO;
1259
1260 for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
1261 struct page *page = bvec->bv_page;
1262
1263 if (pb->pb_error) {
1264 SetPageError(page);
1265 } else if (blocksize == PAGE_CACHE_SIZE) {
1266 SetPageUptodate(page);
1267 } else if (!PagePrivate(page) &&
1268 (pb->pb_flags & _PBF_PAGE_CACHE)) {
1269 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1270 }
1271
1272 if (_pagebuf_iolocked(pb)) {
1273 unlock_page(page);
1274 }
1275 }
1276
1277 _pagebuf_iodone(pb, 1);
1278 bio_put(bio);
1279 return 0;
1280}
1281
1282STATIC void
1283_pagebuf_ioapply(
1284 xfs_buf_t *pb)
1285{
1286 int i, rw, map_i, total_nr_pages, nr_pages;
1287 struct bio *bio;
1288 int offset = pb->pb_offset;
1289 int size = pb->pb_count_desired;
1290 sector_t sector = pb->pb_bn;
1291 unsigned int blocksize = pb->pb_target->pbr_bsize;
1292 int locking = _pagebuf_iolocked(pb);
1293
1294 total_nr_pages = pb->pb_page_count;
1295 map_i = 0;
1296
1297 if (pb->pb_flags & _PBF_RUN_QUEUES) {
1298 pb->pb_flags &= ~_PBF_RUN_QUEUES;
1299 rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
1300 } else {
1301 rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
1302 }
1303
1304 /* Special code path for reading in a sub page size pagebuf --
1305 * we populate the whole page, and hence the other metadata
1306 * in the same page. This optimization is only valid when the
1307 * filesystem block size and the page size are equal.
1308 */
1309 if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
1310 (pb->pb_flags & PBF_READ) && locking &&
1311 (blocksize == PAGE_CACHE_SIZE)) {
1312 bio = bio_alloc(GFP_NOIO, 1);
1313
1314 bio->bi_bdev = pb->pb_target->pbr_bdev;
1315 bio->bi_sector = sector - (offset >> BBSHIFT);
1316 bio->bi_end_io = bio_end_io_pagebuf;
1317 bio->bi_private = pb;
1318
1319 bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
1320 size = 0;
1321
1322 atomic_inc(&pb->pb_io_remaining);
1323
1324 goto submit_io;
1325 }
1326
1327 /* Lock down the pages which we need to for the request */
1328 if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
1329 for (i = 0; size; i++) {
1330 int nbytes = PAGE_CACHE_SIZE - offset;
1331 struct page *page = pb->pb_pages[i];
1332
1333 if (nbytes > size)
1334 nbytes = size;
1335
1336 lock_page(page);
1337
1338 size -= nbytes;
1339 offset = 0;
1340 }
1341 offset = pb->pb_offset;
1342 size = pb->pb_count_desired;
1343 }
1344
1345next_chunk:
1346 atomic_inc(&pb->pb_io_remaining);
1347 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1348 if (nr_pages > total_nr_pages)
1349 nr_pages = total_nr_pages;
1350
1351 bio = bio_alloc(GFP_NOIO, nr_pages);
1352 bio->bi_bdev = pb->pb_target->pbr_bdev;
1353 bio->bi_sector = sector;
1354 bio->bi_end_io = bio_end_io_pagebuf;
1355 bio->bi_private = pb;
1356
1357 for (; size && nr_pages; nr_pages--, map_i++) {
1358 int nbytes = PAGE_CACHE_SIZE - offset;
1359
1360 if (nbytes > size)
1361 nbytes = size;
1362
1363 if (bio_add_page(bio, pb->pb_pages[map_i],
1364 nbytes, offset) < nbytes)
1365 break;
1366
1367 offset = 0;
1368 sector += nbytes >> BBSHIFT;
1369 size -= nbytes;
1370 total_nr_pages--;
1371 }
1372
1373submit_io:
1374 if (likely(bio->bi_size)) {
1375 submit_bio(rw, bio);
1376 if (size)
1377 goto next_chunk;
1378 } else {
1379 bio_put(bio);
1380 pagebuf_ioerror(pb, EIO);
1381 }
1382}
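/*
 * Chunking arithmetic above, worked through for 4k pages:
 * PAGE_SHIFT - BBSHIFT == 3, so each bio carries at most
 * BIO_MAX_SECTORS / 8 pages.  Larger buffers loop through next_chunk;
 * every chunk bumps pb_io_remaining and bio_end_io_pagebuf drops it
 * again, so pagebuf_iodone runs only after the last chunk completes.
 */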
1383
1384/*
1385 * pagebuf_iorequest -- the core I/O request routine.
1386 */
1387int
1388pagebuf_iorequest( /* start real I/O */
1389 xfs_buf_t *pb) /* buffer to convey to device */
1390{
1391 PB_TRACE(pb, "iorequest", 0);
1392
1393 if (pb->pb_flags & PBF_DELWRI) {
1394 pagebuf_delwri_queue(pb, 1);
1395 return 0;
1396 }
1397
1398 if (pb->pb_flags & PBF_WRITE) {
1399 _pagebuf_wait_unpin(pb);
1400 }
1401
1402 pagebuf_hold(pb);
1403
1404 /* Set the count to 1 initially, this will stop an I/O
1405 * completion callout which happens before we have started
1406 * all the I/O from calling pagebuf_iodone too early.
1407 */
1408 atomic_set(&pb->pb_io_remaining, 1);
1409 _pagebuf_ioapply(pb);
1410 _pagebuf_iodone(pb, 0);
1411
1412 pagebuf_rele(pb);
1413 return 0;
1414}
1415
1416/*
1417 * pagebuf_iowait
1418 *
1419 * pagebuf_iowait waits for I/O to complete on the buffer supplied.
1420 * It returns immediately if no I/O is pending. In any case, it returns
1421 * the error code, if any, or 0 if there is no error.
1422 */
1423int
1424pagebuf_iowait(
1425 xfs_buf_t *pb)
1426{
1427 PB_TRACE(pb, "iowait", 0);
1428 if (atomic_read(&pb->pb_io_remaining))
1429 blk_run_address_space(pb->pb_target->pbr_mapping);
1430 down(&pb->pb_iodonesema);
1431 PB_TRACE(pb, "iowaited", (long)pb->pb_error);
1432 return pb->pb_error;
1433}
1434
1435caddr_t
1436pagebuf_offset(
1437 xfs_buf_t *pb,
1438 size_t offset)
1439{
1440 struct page *page;
1441
1442 offset += pb->pb_offset;
1443
1444 page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
1445 return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
1446}
1447
1448/*
1449 * pagebuf_iomove
1450 *
1451 * Move data into or out of a buffer.
1452 */
1453void
1454pagebuf_iomove(
1455 xfs_buf_t *pb, /* buffer to process */
1456 size_t boff, /* starting buffer offset */
1457 size_t bsize, /* length to copy */
1458 caddr_t data, /* data address */
1459 page_buf_rw_t mode) /* read/write flag */
1460{
1461 size_t bend, cpoff, csize;
1462 struct page *page;
1463
1464 bend = boff + bsize;
1465 while (boff < bend) {
1466 page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
1467 cpoff = page_buf_poff(boff + pb->pb_offset);
1468 csize = min_t(size_t,
1469 PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
1470
1471 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1472
1473 switch (mode) {
1474 case PBRW_ZERO:
1475 memset(page_address(page) + cpoff, 0, csize);
1476 break;
1477 case PBRW_READ:
1478 memcpy(data, page_address(page) + cpoff, csize);
1479 break;
1480 case PBRW_WRITE:
1481 memcpy(page_address(page) + cpoff, data, csize);
1482 }
1483
1484 boff += csize;
1485 data += csize;
1486 }
1487}
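/*
 * Usage sketch, with hdr standing in for some caller-supplied
 * structure: zero a region, then copy data into the buffer.
 *
 *	pagebuf_iomove(pb, 0, 512, NULL, PBRW_ZERO);
 *	pagebuf_iomove(pb, 0, sizeof(hdr), (caddr_t)&hdr, PBRW_WRITE);
 *
 * The loop above walks page by page, so the copy works even when the
 * buffer spans discontiguous pages.
 */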
1488
1489/*
1490 * Handling of buftargs.
1491 */
1492
1493/*
1494 * Wait for any bufs with callbacks that have been submitted but
1495 * have not yet returned... walk the hash list for the target.
1496 */
1497void
1498xfs_wait_buftarg(
1499 xfs_buftarg_t *btp)
1500{
1501 xfs_buf_t *bp, *n;
1502 xfs_bufhash_t *hash;
1503 uint i;
1504
1505 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1506 hash = &btp->bt_hash[i];
1507again:
1508 spin_lock(&hash->bh_lock);
1509 list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
1510 ASSERT(btp == bp->pb_target);
1511 if (!(bp->pb_flags & PBF_FS_MANAGED)) {
1512 spin_unlock(&hash->bh_lock);
1513 delay(100);
1514 goto again;
1515 }
1516 }
1517 spin_unlock(&hash->bh_lock);
1518 }
1519}
1520
1521/*
1522 * Allocate buffer hash table for a given target.
1523 * For devices containing metadata (i.e. not the log/realtime devices)
1524 * we need to allocate a much larger hash table.
1525 */
1526STATIC void
1527xfs_alloc_bufhash(
1528 xfs_buftarg_t *btp,
1529 int external)
1530{
1531 unsigned int i;
1532
1533 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1534 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1535 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
1536 sizeof(xfs_bufhash_t), KM_SLEEP);
1537 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1538 spin_lock_init(&btp->bt_hash[i].bh_lock);
1539 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
1540 }
1541}
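
To make the sizing above concrete: a hash shift of 3 yields 8 buckets for external (log/realtime) devices, a shift of 8 yields 256 buckets for metadata devices, and (1 << shift) - 1 is the mask that folds a hash value into the bucket range. A small standalone C check of that arithmetic, for illustration only:

#include <assert.h>

int main(void)
{
	unsigned int shift = 8;			/* metadata device case */
	unsigned int nbuckets = 1u << shift;	/* 256 buckets */
	unsigned int mask = nbuckets - 1;	/* 0xff */

	assert(nbuckets == 256);
	assert(((1u << 3) - 1) == 7);		/* external case: 8 buckets */
	assert((0x12345678u & mask) < nbuckets);/* any hash lands in range */
	return 0;
}
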
1542
1543STATIC void
1544xfs_free_bufhash(
1545 xfs_buftarg_t *btp)
1546{
1547 kmem_free(btp->bt_hash,
1548 (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
1549 btp->bt_hash = NULL;
1550}
1551
1552void
1553xfs_free_buftarg(
1554 xfs_buftarg_t *btp,
1555 int external)
1556{
1557 xfs_flush_buftarg(btp, 1);
1558 if (external)
1559 xfs_blkdev_put(btp->pbr_bdev);
1560 xfs_free_bufhash(btp);
1561 iput(btp->pbr_mapping->host);
1562 kmem_free(btp, sizeof(*btp));
1563}
1564
1565void
1566xfs_incore_relse(
1567 xfs_buftarg_t *btp,
1568 int delwri_only,
1569 int wait)
1570{
1571 invalidate_bdev(btp->pbr_bdev, 1);
1572 truncate_inode_pages(btp->pbr_mapping, 0LL);
1573}
1574
1575STATIC int
1576xfs_setsize_buftarg_flags(
1577 xfs_buftarg_t *btp,
1578 unsigned int blocksize,
1579 unsigned int sectorsize,
1580 int verbose)
1581{
1582 btp->pbr_bsize = blocksize;
1583 btp->pbr_sshift = ffs(sectorsize) - 1;
1584 btp->pbr_smask = sectorsize - 1;
1585
1586 if (set_blocksize(btp->pbr_bdev, sectorsize)) {
1587 printk(KERN_WARNING
1588 "XFS: Cannot set_blocksize to %u on device %s\n",
1589 sectorsize, XFS_BUFTARG_NAME(btp));
1590 return EINVAL;
1591 }
1592
1593 if (verbose &&
1594 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1595 printk(KERN_WARNING
1596 "XFS: %u byte sectors in use on device %s. "
1597 "This is suboptimal; %u or greater is ideal.\n",
1598 sectorsize, XFS_BUFTARG_NAME(btp),
1599 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1600 }
1601
1602 return 0;
1603}
1604
1605/*
1606 * When allocating the initial buffer target we have not yet
1607 * read in the superblock, so we don't know what size sectors
1608 * are being used at this early stage. Play safe.
1609 */
1610STATIC int
1611xfs_setsize_buftarg_early(
1612 xfs_buftarg_t *btp,
1613 struct block_device *bdev)
1614{
1615 return xfs_setsize_buftarg_flags(btp,
1616 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
1617}
1618
1619int
1620xfs_setsize_buftarg(
1621 xfs_buftarg_t *btp,
1622 unsigned int blocksize,
1623 unsigned int sectorsize)
1624{
1625 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1626}
1627
1628STATIC int
1629xfs_mapping_buftarg(
1630 xfs_buftarg_t *btp,
1631 struct block_device *bdev)
1632{
1633 struct backing_dev_info *bdi;
1634 struct inode *inode;
1635 struct address_space *mapping;
1636 static struct address_space_operations mapping_aops = {
1637 .sync_page = block_sync_page,
1638 };
1639
1640 inode = new_inode(bdev->bd_inode->i_sb);
1641 if (!inode) {
1642 printk(KERN_WARNING
1643 "XFS: Cannot allocate mapping inode for device %s\n",
1644 XFS_BUFTARG_NAME(btp));
1645 return ENOMEM;
1646 }
1647 inode->i_mode = S_IFBLK;
1648 inode->i_bdev = bdev;
1649 inode->i_rdev = bdev->bd_dev;
1650 bdi = blk_get_backing_dev_info(bdev);
1651 if (!bdi)
1652 bdi = &default_backing_dev_info;
1653 mapping = &inode->i_data;
1654 mapping->a_ops = &mapping_aops;
1655 mapping->backing_dev_info = bdi;
1656 mapping_set_gfp_mask(mapping, GFP_NOFS);
1657 btp->pbr_mapping = mapping;
1658 return 0;
1659}
1660
1661xfs_buftarg_t *
1662xfs_alloc_buftarg(
1663 struct block_device *bdev,
1664 int external)
1665{
1666 xfs_buftarg_t *btp;
1667
1668 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1669
1670 btp->pbr_dev = bdev->bd_dev;
1671 btp->pbr_bdev = bdev;
1672 if (xfs_setsize_buftarg_early(btp, bdev))
1673 goto error;
1674 if (xfs_mapping_buftarg(btp, bdev))
1675 goto error;
1676 xfs_alloc_bufhash(btp, external);
1677 return btp;
1678
1679error:
1680 kmem_free(btp, sizeof(*btp));
1681 return NULL;
1682}
1683
1684
1685/*
1686 * Pagebuf delayed write buffer handling
1687 */
1688
1689STATIC LIST_HEAD(pbd_delwrite_queue);
1690STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);
1691
1692STATIC void
1693pagebuf_delwri_queue(
1694 xfs_buf_t *pb,
1695 int unlock)
1696{
1697 PB_TRACE(pb, "delwri_q", (long)unlock);
1698 ASSERT(pb->pb_flags & PBF_DELWRI);
1699
1700 spin_lock(&pbd_delwrite_lock);
1701 /* If already in the queue, dequeue and place at tail */
1702 if (!list_empty(&pb->pb_list)) {
1703 if (unlock) {
1704 atomic_dec(&pb->pb_hold);
1705 }
1706 list_del(&pb->pb_list);
1707 }
1708
1709 list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
1710 pb->pb_queuetime = jiffies;
1711 spin_unlock(&pbd_delwrite_lock);
1712
1713 if (unlock)
1714 pagebuf_unlock(pb);
1715}
1716
1717void
1718pagebuf_delwri_dequeue(
1719 xfs_buf_t *pb)
1720{
1721 int dequeued = 0;
1722
1723 spin_lock(&pbd_delwrite_lock);
1724 if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
1725 list_del_init(&pb->pb_list);
1726 dequeued = 1;
1727 }
1728 pb->pb_flags &= ~PBF_DELWRI;
1729 spin_unlock(&pbd_delwrite_lock);
1730
1731 if (dequeued)
1732 pagebuf_rele(pb);
1733
1734 PB_TRACE(pb, "delwri_dq", (long)dequeued);
1735}
1736
1737STATIC void
1738pagebuf_runall_queues(
1739 struct workqueue_struct *queue)
1740{
1741 flush_workqueue(queue);
1742}
1743
1744/* Defines for pagebuf daemon */
1745STATIC DECLARE_COMPLETION(pagebuf_daemon_done);
1746STATIC struct task_struct *pagebuf_daemon_task;
1747STATIC int pagebuf_daemon_active;
1748STATIC int force_flush;
1749
1750
1751STATIC int
1752pagebuf_daemon_wakeup(
1753 int priority,
1754 unsigned int mask)
1755{
1756 force_flush = 1;
1757 barrier();
1758 wake_up_process(pagebuf_daemon_task);
1759 return 0;
1760}
1761
1762STATIC int
1763pagebuf_daemon(
1764 void *data)
1765{
1766 struct list_head tmp;
1767 unsigned long age;
1768 xfs_buftarg_t *target;
1769 xfs_buf_t *pb, *n;
1770
1771 /* Set up the thread */
1772 daemonize("xfsbufd");
1773 current->flags |= PF_MEMALLOC;
1774
1775 pagebuf_daemon_task = current;
1776 pagebuf_daemon_active = 1;
1777 barrier();
1778
1779 INIT_LIST_HEAD(&tmp);
1780 do {
1781 try_to_freeze(PF_FREEZE);
1782
1783 set_current_state(TASK_INTERRUPTIBLE);
1784 schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
1785
1786 age = (xfs_buf_age_centisecs * HZ) / 100;
1787 spin_lock(&pbd_delwrite_lock);
1788 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
1789 PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
1790 ASSERT(pb->pb_flags & PBF_DELWRI);
1791
1792 if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
1793 if (!force_flush &&
1794 time_before(jiffies,
1795 pb->pb_queuetime + age)) {
1796 pagebuf_unlock(pb);
1797 break;
1798 }
1799
1800 pb->pb_flags &= ~PBF_DELWRI;
1801 pb->pb_flags |= PBF_WRITE;
1802 list_move(&pb->pb_list, &tmp);
1803 }
1804 }
1805 spin_unlock(&pbd_delwrite_lock);
1806
1807 while (!list_empty(&tmp)) {
1808 pb = list_entry(tmp.next, xfs_buf_t, pb_list);
1809 target = pb->pb_target;
1810
1811 list_del_init(&pb->pb_list);
1812 pagebuf_iostrategy(pb);
1813
1814 blk_run_address_space(target->pbr_mapping);
1815 }
1816
1817 if (as_list_len > 0)
1818 purge_addresses();
1819
1820 force_flush = 0;
1821 } while (pagebuf_daemon_active);
1822
1823 complete_and_exit(&pagebuf_daemon_done, 0);
1824}
1825
1826/*
1827 * Go through all incore buffers, and release buffers if they belong to
1828 * the given device. This is used in filesystem error handling to
1829 * preserve the consistency of its metadata.
1830 */
1831int
1832xfs_flush_buftarg(
1833 xfs_buftarg_t *target,
1834 int wait)
1835{
1836 struct list_head tmp;
1837 xfs_buf_t *pb, *n;
1838 int pincount = 0;
1839
1840 pagebuf_runall_queues(pagebuf_dataio_workqueue);
1841 pagebuf_runall_queues(pagebuf_logio_workqueue);
1842
1843 INIT_LIST_HEAD(&tmp);
1844 spin_lock(&pbd_delwrite_lock);
1845 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
1846
1847 if (pb->pb_target != target)
1848 continue;
1849
1850 ASSERT(pb->pb_flags & PBF_DELWRI);
1851 PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
1852 if (pagebuf_ispin(pb)) {
1853 pincount++;
1854 continue;
1855 }
1856
1857 pb->pb_flags &= ~PBF_DELWRI;
1858 pb->pb_flags |= PBF_WRITE;
1859 list_move(&pb->pb_list, &tmp);
1860 }
1861 spin_unlock(&pbd_delwrite_lock);
1862
1863 /*
 1864	 * Dropped the delayed write list lock; now walk the temporary list
1865 */
1866 list_for_each_entry_safe(pb, n, &tmp, pb_list) {
1867 if (wait)
1868 pb->pb_flags &= ~PBF_ASYNC;
1869 else
1870 list_del_init(&pb->pb_list);
1871
1872 pagebuf_lock(pb);
1873 pagebuf_iostrategy(pb);
1874 }
1875
1876 /*
1877 * Remaining list items must be flushed before returning
1878 */
1879 while (!list_empty(&tmp)) {
1880 pb = list_entry(tmp.next, xfs_buf_t, pb_list);
1881
1882 list_del_init(&pb->pb_list);
1883 xfs_iowait(pb);
1884 xfs_buf_relse(pb);
1885 }
1886
1887 if (wait)
1888 blk_run_address_space(target->pbr_mapping);
1889
1890 return pincount;
1891}
1892
1893STATIC int
1894pagebuf_daemon_start(void)
1895{
1896 int rval;
1897
1898 pagebuf_logio_workqueue = create_workqueue("xfslogd");
1899 if (!pagebuf_logio_workqueue)
1900 return -ENOMEM;
1901
1902 pagebuf_dataio_workqueue = create_workqueue("xfsdatad");
1903 if (!pagebuf_dataio_workqueue) {
1904 destroy_workqueue(pagebuf_logio_workqueue);
1905 return -ENOMEM;
1906 }
1907
1908 rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES);
1909 if (rval < 0) {
1910 destroy_workqueue(pagebuf_logio_workqueue);
1911 destroy_workqueue(pagebuf_dataio_workqueue);
1912 }
1913
1914 return rval;
1915}
1916
1917/*
1918 * pagebuf_daemon_stop
1919 *
1920 * Note: do not mark as __exit, it is called from pagebuf_terminate.
1921 */
1922STATIC void
1923pagebuf_daemon_stop(void)
1924{
1925 pagebuf_daemon_active = 0;
1926 barrier();
1927 wait_for_completion(&pagebuf_daemon_done);
1928
1929 destroy_workqueue(pagebuf_logio_workqueue);
1930 destroy_workqueue(pagebuf_dataio_workqueue);
1931}
1932
1933/*
1934 * Initialization and Termination
1935 */
1936
1937int __init
1938pagebuf_init(void)
1939{
1940 pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0,
1941 SLAB_HWCACHE_ALIGN, NULL, NULL);
1942 if (pagebuf_cache == NULL) {
1943 printk("XFS: couldn't init xfs_buf_t cache\n");
1944 pagebuf_terminate();
1945 return -ENOMEM;
1946 }
1947
1948#ifdef PAGEBUF_TRACE
1949 pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
1950#endif
1951
1952 pagebuf_daemon_start();
1953
1954 pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup);
1955 if (pagebuf_shake == NULL) {
1956 pagebuf_terminate();
1957 return -ENOMEM;
1958 }
1959
1960 return 0;
1961}
1962
1963
1964/*
1965 * pagebuf_terminate.
1966 *
1967 * Note: do not mark as __exit, this is also called from the __init code.
1968 */
1969void
1970pagebuf_terminate(void)
1971{
1972 pagebuf_daemon_stop();
1973
1974#ifdef PAGEBUF_TRACE
1975 ktrace_free(pagebuf_trace_buf);
1976#endif
1977
1978 kmem_zone_destroy(pagebuf_cache);
1979 kmem_shake_deregister(pagebuf_shake);
1980}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
new file mode 100644
index 000000000000..74deed8e6d90
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -0,0 +1,591 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
35 */
36
37#ifndef __XFS_BUF_H__
38#define __XFS_BUF_H__
39
40#include <linux/config.h>
41#include <linux/list.h>
42#include <linux/types.h>
43#include <linux/spinlock.h>
44#include <asm/system.h>
45#include <linux/mm.h>
46#include <linux/fs.h>
47#include <linux/buffer_head.h>
48#include <linux/uio.h>
49
50/*
51 * Base types
52 */
53
54#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
55
56#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
57#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
58#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
59#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
60
61typedef enum page_buf_rw_e {
62 PBRW_READ = 1, /* transfer into target memory */
63 PBRW_WRITE = 2, /* transfer from target memory */
64 PBRW_ZERO = 3 /* Zero target memory */
65} page_buf_rw_t;
66
67
68typedef enum page_buf_flags_e { /* pb_flags values */
69 PBF_READ = (1 << 0), /* buffer intended for reading from device */
70 PBF_WRITE = (1 << 1), /* buffer intended for writing to device */
71 PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */
72 PBF_PARTIAL = (1 << 3), /* buffer partially read */
73 PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
74 PBF_NONE = (1 << 5), /* buffer not read at all */
75 PBF_DELWRI = (1 << 6), /* buffer has dirty pages */
76 PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */
77 PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
78 PBF_FS_DATAIOD = (1 << 9), /* schedule IO completion on fs datad */
79 PBF_FORCEIO = (1 << 10), /* ignore any cache state */
80 PBF_FLUSH = (1 << 11), /* flush disk write cache */
81 PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
82
83 /* flags used only as arguments to access routines */
84 PBF_LOCK = (1 << 14), /* lock requested */
85 PBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */
86 PBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */
87
88 /* flags used only internally */
89 _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
90 _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */
91 _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
92} page_buf_flags_t;
93
94#define PBF_UPDATE (PBF_READ | PBF_WRITE)
95#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
96#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
97
98typedef struct xfs_bufhash {
99 struct list_head bh_list;
100 spinlock_t bh_lock;
101} xfs_bufhash_t;
102
103typedef struct xfs_buftarg {
104 dev_t pbr_dev;
105 struct block_device *pbr_bdev;
106 struct address_space *pbr_mapping;
107 unsigned int pbr_bsize;
108 unsigned int pbr_sshift;
109 size_t pbr_smask;
110
111 /* per-device buffer hash table */
112 uint bt_hashmask;
113 uint bt_hashshift;
114 xfs_bufhash_t *bt_hash;
115} xfs_buftarg_t;
116
117/*
118 * xfs_buf_t: Buffer structure for page cache-based buffers
119 *
120 * This buffer structure is used by the page cache buffer management routines
121 * to refer to an assembly of pages forming a logical buffer. The actual I/O
122 * is performed with buffer_head structures, as required by drivers.
123 *
 124 * The buffer structure is used on a temporary basis only, and discarded when
125 * released. The real data storage is recorded in the page cache. Metadata is
126 * hashed to the block device on which the file system resides.
127 */
128
129struct xfs_buf;
130
131/* call-back function on I/O completion */
132typedef void (*page_buf_iodone_t)(struct xfs_buf *);
133/* call-back function on I/O completion */
134typedef void (*page_buf_relse_t)(struct xfs_buf *);
135/* pre-write function */
136typedef int (*page_buf_bdstrat_t)(struct xfs_buf *);
137
138#define PB_PAGES 2
139
140typedef struct xfs_buf {
141 struct semaphore pb_sema; /* semaphore for lockables */
142 unsigned long pb_queuetime; /* time buffer was queued */
143 atomic_t pb_pin_count; /* pin count */
144 wait_queue_head_t pb_waiters; /* unpin waiters */
145 struct list_head pb_list;
146 page_buf_flags_t pb_flags; /* status flags */
147 struct list_head pb_hash_list; /* hash table list */
148 xfs_bufhash_t *pb_hash; /* hash table list start */
149 xfs_buftarg_t *pb_target; /* buffer target (device) */
150 atomic_t pb_hold; /* reference count */
151 xfs_daddr_t pb_bn; /* block number for I/O */
152 loff_t pb_file_offset; /* offset in file */
153 size_t pb_buffer_length; /* size of buffer in bytes */
154 size_t pb_count_desired; /* desired transfer size */
155 void *pb_addr; /* virtual address of buffer */
156 struct work_struct pb_iodone_work;
157 atomic_t pb_io_remaining;/* #outstanding I/O requests */
158 page_buf_iodone_t pb_iodone; /* I/O completion function */
159 page_buf_relse_t pb_relse; /* releasing function */
160 page_buf_bdstrat_t pb_strat; /* pre-write function */
161 struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */
162 void *pb_fspriv;
163 void *pb_fspriv2;
164 void *pb_fspriv3;
165 unsigned short pb_error; /* error code on I/O */
166 unsigned short pb_locked; /* page array is locked */
167 unsigned int pb_page_count; /* size of page array */
168 unsigned int pb_offset; /* page offset in first page */
169 struct page **pb_pages; /* array of page pointers */
170 struct page *pb_page_array[PB_PAGES]; /* inline pages */
171#ifdef PAGEBUF_LOCK_TRACKING
172 int pb_last_holder;
173#endif
174} xfs_buf_t;
175
176
177/* Finding and Reading Buffers */
178
179extern xfs_buf_t *_pagebuf_find( /* find buffer for block if */
180 /* the block is in memory */
181 xfs_buftarg_t *, /* inode for block */
182 loff_t, /* starting offset of range */
183 size_t, /* length of range */
184 page_buf_flags_t, /* PBF_LOCK */
185 xfs_buf_t *); /* newly allocated buffer */
186
187#define xfs_incore(buftarg,blkno,len,lockit) \
188 _pagebuf_find(buftarg, blkno ,len, lockit, NULL)
189
190extern xfs_buf_t *xfs_buf_get_flags( /* allocate a buffer */
191 xfs_buftarg_t *, /* inode for buffer */
192 loff_t, /* starting offset of range */
193 size_t, /* length of range */
194 page_buf_flags_t); /* PBF_LOCK, PBF_READ, */
195 /* PBF_ASYNC */
196
197#define xfs_buf_get(target, blkno, len, flags) \
198 xfs_buf_get_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
199
200extern xfs_buf_t *xfs_buf_read_flags( /* allocate and read a buffer */
201 xfs_buftarg_t *, /* inode for buffer */
202 loff_t, /* starting offset of range */
203 size_t, /* length of range */
204 page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC */
205
206#define xfs_buf_read(target, blkno, len, flags) \
207 xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
208
209extern xfs_buf_t *pagebuf_lookup(
210 xfs_buftarg_t *,
211 loff_t, /* starting offset of range */
212 size_t, /* length of range */
213 page_buf_flags_t); /* PBF_READ, PBF_WRITE, */
214 /* PBF_FORCEIO, */
215
216extern xfs_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */
217 /* no memory or disk address */
218 size_t len,
219 xfs_buftarg_t *); /* mount point "fake" inode */
220
221extern xfs_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */
222 /* without disk address */
223 size_t len,
224 xfs_buftarg_t *); /* mount point "fake" inode */
225
226extern int pagebuf_associate_memory(
227 xfs_buf_t *,
228 void *,
229 size_t);
230
231extern void pagebuf_hold( /* increment reference count */
232 xfs_buf_t *); /* buffer to hold */
233
234extern void pagebuf_readahead( /* read ahead into cache */
235 xfs_buftarg_t *, /* target for buffer (or NULL) */
236 loff_t, /* starting offset of range */
237 size_t, /* length of range */
238 page_buf_flags_t); /* additional read flags */
239
240/* Releasing Buffers */
241
242extern void pagebuf_free( /* deallocate a buffer */
243 xfs_buf_t *); /* buffer to deallocate */
244
245extern void pagebuf_rele( /* release hold on a buffer */
246 xfs_buf_t *); /* buffer to release */
247
248/* Locking and Unlocking Buffers */
249
250extern int pagebuf_cond_lock( /* lock buffer, if not locked */
251 /* (returns -EBUSY if locked) */
252 xfs_buf_t *); /* buffer to lock */
253
254extern int pagebuf_lock_value( /* return count on lock */
255 xfs_buf_t *); /* buffer to check */
256
257extern int pagebuf_lock( /* lock buffer */
258 xfs_buf_t *); /* buffer to lock */
259
260extern void pagebuf_unlock( /* unlock buffer */
261 xfs_buf_t *); /* buffer to unlock */
262
263/* Buffer Read and Write Routines */
264
265extern void pagebuf_iodone( /* mark buffer I/O complete */
266 xfs_buf_t *, /* buffer to mark */
267 int, /* use data/log helper thread. */
268 int); /* run completion locally, or in
269 * a helper thread. */
270
271extern void pagebuf_ioerror( /* mark buffer in error (or not) */
272 xfs_buf_t *, /* buffer to mark */
273 int); /* error to store (0 if none) */
274
275extern int pagebuf_iostart( /* start I/O on a buffer */
276 xfs_buf_t *, /* buffer to start */
277 page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */
278 /* PBF_READ, PBF_WRITE, */
279 /* PBF_DELWRI */
280
281extern int pagebuf_iorequest( /* start real I/O */
282 xfs_buf_t *); /* buffer to convey to device */
283
284extern int pagebuf_iowait( /* wait for buffer I/O done */
285 xfs_buf_t *); /* buffer to wait on */
286
287extern void pagebuf_iomove( /* move data in/out of pagebuf */
288 xfs_buf_t *, /* buffer to manipulate */
289 size_t, /* starting buffer offset */
290 size_t, /* length in buffer */
291 caddr_t, /* data pointer */
292 page_buf_rw_t); /* direction */
293
294static inline int pagebuf_iostrategy(xfs_buf_t *pb)
295{
296 return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb);
297}
298
299static inline int pagebuf_geterror(xfs_buf_t *pb)
300{
301 return pb ? pb->pb_error : ENOMEM;
302}
303
304/* Buffer Utility Routines */
305
306extern caddr_t pagebuf_offset( /* pointer at offset in buffer */
307 xfs_buf_t *, /* buffer to offset into */
308 size_t); /* offset */
309
310/* Pinning Buffer Storage in Memory */
311
312extern void pagebuf_pin( /* pin buffer in memory */
313 xfs_buf_t *); /* buffer to pin */
314
315extern void pagebuf_unpin( /* unpin buffered data */
316 xfs_buf_t *); /* buffer to unpin */
317
318extern int pagebuf_ispin( /* check if buffer is pinned */
319 xfs_buf_t *); /* buffer to check */
320
321/* Delayed Write Buffer Routines */
322
323extern void pagebuf_delwri_dequeue(xfs_buf_t *);
324
325/* Buffer Daemon Setup Routines */
326
327extern int pagebuf_init(void);
328extern void pagebuf_terminate(void);
329
330
331#ifdef PAGEBUF_TRACE
332extern ktrace_t *pagebuf_trace_buf;
333extern void pagebuf_trace(
334 xfs_buf_t *, /* buffer being traced */
335 char *, /* description of operation */
336 void *, /* arbitrary diagnostic value */
337 void *); /* return address */
338#else
339# define pagebuf_trace(pb, id, ptr, ra) do { } while (0)
340#endif
341
342#define pagebuf_target_name(target) \
343 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; })
344
345
346
347
348
349/* These are just for xfs_syncsub... it sets an internal variable
350 * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly acquired buf_t
351 */
352#define XFS_B_ASYNC PBF_ASYNC
353#define XFS_B_DELWRI PBF_DELWRI
354#define XFS_B_READ PBF_READ
355#define XFS_B_WRITE PBF_WRITE
356#define XFS_B_STALE PBF_STALE
357
358#define XFS_BUF_TRYLOCK PBF_TRYLOCK
359#define XFS_INCORE_TRYLOCK PBF_TRYLOCK
360#define XFS_BUF_LOCK PBF_LOCK
361#define XFS_BUF_MAPPED PBF_MAPPED
362
363#define BUF_BUSY PBF_DONT_BLOCK
364
365#define XFS_BUF_BFLAGS(x) ((x)->pb_flags)
366#define XFS_BUF_ZEROFLAGS(x) \
367 ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI))
368
369#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE)
370#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE)
371#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE)
372#define XFS_BUF_SUPER_STALE(x) do { \
373 XFS_BUF_STALE(x); \
374 pagebuf_delwri_dequeue(x); \
375 XFS_BUF_DONE(x); \
376 } while (0)
377
378#define XFS_BUF_MANAGE PBF_FS_MANAGED
379#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED)
380
381#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI)
382#define XFS_BUF_UNDELAYWRITE(x) pagebuf_delwri_dequeue(x)
383#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI)
384
385#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no)
386#define XFS_BUF_GETERROR(x) pagebuf_geterror(x)
387#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0)
388
389#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
390#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
391#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x)))
392
393#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO)
394#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO)
395#define XFS_BUF_ISBUSY(x) (1)
396
397#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC)
398#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC)
399#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC)
400
401#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH)
402#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH)
403#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH)
404
405#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n")
406#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n")
407#define XFS_BUF_ISSHUT(x) (0)
408
409#define XFS_BUF_HOLD(x) pagebuf_hold(x)
410#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ)
411#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ)
412#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ)
413
414#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE)
415#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE)
416#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE)
417
418#define XFS_BUF_ISUNINITIAL(x) (0)
419#define XFS_BUF_UNUNINITIAL(x) (0)
420
421#define XFS_BUF_BP_ISMAPPED(bp) 1
422
423#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD)
424#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD)
425
426#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone
427#define XFS_BUF_SET_IODONE_FUNC(buf, func) \
428 (buf)->pb_iodone = (func)
429#define XFS_BUF_CLR_IODONE_FUNC(buf) \
430 (buf)->pb_iodone = NULL
431#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \
432 (buf)->pb_strat = (func)
433#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \
434 (buf)->pb_strat = NULL
435
436#define XFS_BUF_FSPRIVATE(buf, type) \
437 ((type)(buf)->pb_fspriv)
438#define XFS_BUF_SET_FSPRIVATE(buf, value) \
439 (buf)->pb_fspriv = (void *)(value)
440#define XFS_BUF_FSPRIVATE2(buf, type) \
441 ((type)(buf)->pb_fspriv2)
442#define XFS_BUF_SET_FSPRIVATE2(buf, value) \
443 (buf)->pb_fspriv2 = (void *)(value)
444#define XFS_BUF_FSPRIVATE3(buf, type) \
445 ((type)(buf)->pb_fspriv3)
446#define XFS_BUF_SET_FSPRIVATE3(buf, value) \
447 (buf)->pb_fspriv3 = (void *)(value)
448#define XFS_BUF_SET_START(buf)
449
450#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
451 (buf)->pb_relse = (value)
452
453#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr)
454
455extern inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
456{
457 if (bp->pb_flags & PBF_MAPPED)
458 return XFS_BUF_PTR(bp) + offset;
459 return (xfs_caddr_t) pagebuf_offset(bp, offset);
460}
461
462#define XFS_BUF_SET_PTR(bp, val, count) \
463 pagebuf_associate_memory(bp, val, count)
464#define XFS_BUF_ADDR(bp) ((bp)->pb_bn)
465#define XFS_BUF_SET_ADDR(bp, blk) \
466 ((bp)->pb_bn = (xfs_daddr_t)(blk))
467#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset)
468#define XFS_BUF_SET_OFFSET(bp, off) \
469 ((bp)->pb_file_offset = (off))
470#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired)
471#define XFS_BUF_SET_COUNT(bp, cnt) \
472 ((bp)->pb_count_desired = (cnt))
473#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length)
474#define XFS_BUF_SET_SIZE(bp, cnt) \
475 ((bp)->pb_buffer_length = (cnt))
476#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
477#define XFS_BUF_SET_VTYPE(bp, type)
478#define XFS_BUF_SET_REF(bp, ref)
479
480#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp)
481
482#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp)
483#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0)
484#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp)
485#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp)
486#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema);
487
488/* setup the buffer target from a buftarg structure */
489#define XFS_BUF_SET_TARGET(bp, target) \
490 (bp)->pb_target = (target)
491#define XFS_BUF_TARGET(bp) ((bp)->pb_target)
492#define XFS_BUFTARG_NAME(target) \
493 pagebuf_target_name(target)
494
495#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
496#define XFS_BUF_SET_VTYPE(bp, type)
497#define XFS_BUF_SET_REF(bp, ref)
498
499static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
500{
501 bp->pb_fspriv3 = mp;
502 bp->pb_strat = xfs_bdstrat_cb;
503 pagebuf_delwri_dequeue(bp);
504 return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | _PBF_RUN_QUEUES);
505}
506
507static inline void xfs_buf_relse(xfs_buf_t *bp)
508{
509 if (!bp->pb_relse)
510 pagebuf_unlock(bp);
511 pagebuf_rele(bp);
512}
513
514#define xfs_bpin(bp) pagebuf_pin(bp)
515#define xfs_bunpin(bp) pagebuf_unpin(bp)
516
517#define xfs_buftrace(id, bp) \
518 pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
519
520#define xfs_biodone(pb) \
521 pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0)
522
523#define xfs_biomove(pb, off, len, data, rw) \
524 pagebuf_iomove((pb), (off), (len), (data), \
525 ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
526
527#define xfs_biozero(pb, off, len) \
528 pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
529
530
531static inline int XFS_bwrite(xfs_buf_t *pb)
532{
533 int iowait = (pb->pb_flags & PBF_ASYNC) == 0;
534 int error = 0;
535
536 if (!iowait)
537 pb->pb_flags |= _PBF_RUN_QUEUES;
538
539 pagebuf_delwri_dequeue(pb);
540 pagebuf_iostrategy(pb);
541 if (iowait) {
542 error = pagebuf_iowait(pb);
543 xfs_buf_relse(pb);
544 }
545 return error;
546}
547
548#define XFS_bdwrite(pb) \
549 pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
550
551static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
552{
553 bp->pb_strat = xfs_bdstrat_cb;
554 bp->pb_fspriv3 = mp;
555
556 return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
557}
558
559#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
560
561#define xfs_iowait(pb) pagebuf_iowait(pb)
562
563#define xfs_baread(target, rablkno, ralen) \
564 pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK)
565
566#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target))
567#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target))
568#define xfs_buf_free(bp) pagebuf_free(bp)
569
570
571/*
572 * Handling of buftargs.
573 */
574
575extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
576extern void xfs_free_buftarg(xfs_buftarg_t *, int);
577extern void xfs_wait_buftarg(xfs_buftarg_t *);
578extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
579extern void xfs_incore_relse(xfs_buftarg_t *, int, int);
580extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
581
582#define xfs_getsize_buftarg(buftarg) \
583 block_size((buftarg)->pbr_bdev)
584#define xfs_readonly_buftarg(buftarg) \
585 bdev_read_only((buftarg)->pbr_bdev)
586#define xfs_binval(buftarg) \
587 xfs_flush_buftarg(buftarg, 1)
588#define XFS_bflush(buftarg) \
589 xfs_flush_buftarg(buftarg, 1)
590
591#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
new file mode 100644
index 000000000000..00c45849d41a
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -0,0 +1,50 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_CRED_H__
33#define __XFS_CRED_H__
34
35/*
36 * Credentials
37 */
38typedef struct cred {
39 /* EMPTY */
40} cred_t;
41
42extern struct cred *sys_cred;
43
 44/* This is a hack (assumes sys_cred is the only cred_t in the system) */
45static __inline int capable_cred(cred_t *cr, int cid)
46{
47 return (cr == sys_cred) ? 1 : capable(cid);
48}
49
50#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
new file mode 100644
index 000000000000..f372a1a5e168
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -0,0 +1,205 @@
1/*
2 * Copyright (c) 2004-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_types.h"
35#include "xfs_dmapi.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_dir.h"
40#include "xfs_mount.h"
41#include "xfs_export.h"
42
43/*
 44 * XFS encodes and decodes the fileid portion of NFS filehandles
45 * itself instead of letting the generic NFS code do it. This
46 * allows filesystems with 64 bit inode numbers to be exported.
47 *
48 * Note that a side effect is that xfs_vget() won't be passed a
 49 * zero inode/generation pair under normal circumstances. However,
 50 * as a malicious client could send us such data, the check
 51 * remains in that code.
52 */
53
54
55STATIC struct dentry *
56linvfs_decode_fh(
57 struct super_block *sb,
58 __u32 *fh,
59 int fh_len,
60 int fileid_type,
61 int (*acceptable)(
62 void *context,
63 struct dentry *de),
64 void *context)
65{
66 xfs_fid2_t ifid;
67 xfs_fid2_t pfid;
68 void *parent = NULL;
69 int is64 = 0;
70 __u32 *p = fh;
71
72#if XFS_BIG_INUMS
73 is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG);
74 fileid_type &= ~XFS_FILEID_TYPE_64FLAG;
75#endif
76
77 /*
78 * Note that we only accept fileids which are long enough
79 * rather than allow the parent generation number to default
 80 * to zero. XFS considers zero a valid generation number, not
81 * an invalid/wildcard value. There's little point printk'ing
82 * a warning here as we don't have the client information
83 * which would make such a warning useful.
84 */
85 if (fileid_type > 2 ||
86 fh_len < xfs_fileid_length((fileid_type == 2), is64))
87 return NULL;
88
89 p = xfs_fileid_decode_fid2(p, &ifid, is64);
90
91 if (fileid_type == 2) {
92 p = xfs_fileid_decode_fid2(p, &pfid, is64);
93 parent = &pfid;
94 }
95
96 fh = (__u32 *)&ifid;
97 return find_exported_dentry(sb, fh, parent, acceptable, context);
98}
99
100
101STATIC int
102linvfs_encode_fh(
103 struct dentry *dentry,
104 __u32 *fh,
105 int *max_len,
106 int connectable)
107{
108 struct inode *inode = dentry->d_inode;
109 int type = 1;
110 __u32 *p = fh;
111 int len;
112 int is64 = 0;
113#if XFS_BIG_INUMS
114 vfs_t *vfs = LINVFS_GET_VFS(inode->i_sb);
115 xfs_mount_t *mp = XFS_VFSTOM(vfs);
116
117 if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT)) {
118 /* filesystem may contain 64bit inode numbers */
119 is64 = XFS_FILEID_TYPE_64FLAG;
120 }
121#endif
122
 123	/* Directories don't need their parent encoded; they have ".." */
124 if (S_ISDIR(inode->i_mode))
125 connectable = 0;
126
127 /*
128 * Only encode if there is enough space given. In practice
129 * this means we can't export a filesystem with 64bit inodes
130 * over NFSv2 with the subtree_check export option; the other
131 * seven combinations work. The real answer is "don't use v2".
132 */
133 len = xfs_fileid_length(connectable, is64);
134 if (*max_len < len)
135 return 255;
136 *max_len = len;
137
138 p = xfs_fileid_encode_inode(p, inode, is64);
139 if (connectable) {
140 spin_lock(&dentry->d_lock);
141 p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64);
142 spin_unlock(&dentry->d_lock);
143 type = 2;
144 }
145 BUG_ON((p - fh) != len);
146 return type | is64;
147}
148
149STATIC struct dentry *
150linvfs_get_dentry(
151 struct super_block *sb,
152 void *data)
153{
154 vnode_t *vp;
155 struct inode *inode;
156 struct dentry *result;
157 vfs_t *vfsp = LINVFS_GET_VFS(sb);
158 int error;
159
160 VFS_VGET(vfsp, &vp, (fid_t *)data, error);
161 if (error || vp == NULL)
 162		return ERR_PTR(-ESTALE);
163
164 inode = LINVFS_GET_IP(vp);
165 result = d_alloc_anon(inode);
166 if (!result) {
167 iput(inode);
168 return ERR_PTR(-ENOMEM);
169 }
170 return result;
171}
172
173STATIC struct dentry *
174linvfs_get_parent(
175 struct dentry *child)
176{
177 int error;
178 vnode_t *vp, *cvp;
179 struct dentry *parent;
180 struct dentry dotdot;
181
182 dotdot.d_name.name = "..";
183 dotdot.d_name.len = 2;
184 dotdot.d_inode = NULL;
185
186 cvp = NULL;
187 vp = LINVFS_GET_VP(child->d_inode);
188 VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
189 if (unlikely(error))
190 return ERR_PTR(-error);
191
192 parent = d_alloc_anon(LINVFS_GET_IP(cvp));
193 if (unlikely(!parent)) {
194 VN_RELE(cvp);
195 return ERR_PTR(-ENOMEM);
196 }
197 return parent;
198}
199
200struct export_operations linvfs_export_ops = {
201 .decode_fh = linvfs_decode_fh,
202 .encode_fh = linvfs_encode_fh,
203 .get_parent = linvfs_get_parent,
204 .get_dentry = linvfs_get_dentry,
205};
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h
new file mode 100644
index 000000000000..60b2abac1c18
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_export.h
@@ -0,0 +1,122 @@
1/*
2 * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_EXPORT_H__
33#define __XFS_EXPORT_H__
34
35/*
36 * Common defines for code related to exporting XFS filesystems over NFS.
37 *
38 * The NFS fileid goes out on the wire as an array of
39 * 32bit unsigned ints in host order. There are 5 possible
40 * formats.
41 *
42 * (1) fileid_type=0x00
43 * (no fileid data; handled by the generic code)
44 *
45 * (2) fileid_type=0x01
46 * inode-num
47 * generation
48 *
49 * (3) fileid_type=0x02
50 * inode-num
51 * generation
52 * parent-inode-num
53 * parent-generation
54 *
55 * (4) fileid_type=0x81
56 * inode-num-lo32
57 * inode-num-hi32
58 * generation
59 *
60 * (5) fileid_type=0x82
61 * inode-num-lo32
62 * inode-num-hi32
63 * generation
64 * parent-inode-num-lo32
65 * parent-inode-num-hi32
66 * parent-generation
67 *
68 * Note, the NFS filehandle also includes an fsid portion which
69 * may have an inode number in it. That number is hardcoded to
70 * 32bits and there is no way for XFS to intercept it. In
 71 * practice this means when exporting an XFS filesystem with 64bit
72 * inodes you should either export the mountpoint (rather than
73 * a subdirectory) or use the "fsid" export option.
74 */
75
76/* This flag goes on the wire. Don't play with it. */
77#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
78
79/* Calculate the length in u32 units of the fileid data */
80static inline int
81xfs_fileid_length(int hasparent, int is64)
82{
83 return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2);
84}
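
Working through xfs_fileid_length() against the format table above gives the expected word count for each on-the-wire variant. A standalone C sketch verifying those counts; the local fileid_length copy below simply mirrors the inline above for illustration:

#include <assert.h>

static int fileid_length(int hasparent, int is64)
{
	return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2);
}

int main(void)
{
	assert(fileid_length(0, 0) == 2);	/* type 0x01: ino, gen */
	assert(fileid_length(1, 0) == 4);	/* type 0x02: + parent ino, gen */
	assert(fileid_length(0, 1) == 3);	/* type 0x81: ino-lo32, ino-hi32, gen */
	assert(fileid_length(1, 1) == 6);	/* type 0x82: + parent lo32/hi32, gen */
	return 0;
}
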
85
86/*
87 * Decode encoded inode information (either for the inode itself
88 * or the parent) into an xfs_fid2_t structure. Advances and
 90 * returns the new data pointer.
90 */
91static inline __u32 *
92xfs_fileid_decode_fid2(__u32 *p, xfs_fid2_t *fid, int is64)
93{
94 fid->fid_len = sizeof(xfs_fid2_t) - sizeof(fid->fid_len);
95 fid->fid_pad = 0;
96 fid->fid_ino = *p++;
97#if XFS_BIG_INUMS
98 if (is64)
99 fid->fid_ino |= (((__u64)(*p++)) << 32);
100#endif
101 fid->fid_gen = *p++;
102 return p;
103}
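
The 64-bit branch above splices two 32-bit wire words back into a single inode number: the low word first, then the high word shifted up by 32. A small standalone C example of that recombination, using made-up values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t wire[3] = { 0x89abcdefu, 0x01234567u, 42u };	/* ino-lo, ino-hi, gen */
	uint64_t ino;

	ino = (uint64_t)wire[0] | ((uint64_t)wire[1] << 32);
	assert(ino == 0x0123456789abcdefULL);
	assert(wire[2] == 42u);		/* generation follows the inode words */
	return 0;
}
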
104
105/*
106 * Encode inode information (either for the inode itself or the
107 * parent) into a fileid buffer. Advances and returns the new
108 * data pointer.
109 */
110static inline __u32 *
111xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64)
112{
113 *p++ = (__u32)inode->i_ino;
114#if XFS_BIG_INUMS
115 if (is64)
116 *p++ = (__u32)(inode->i_ino >> 32);
117#endif
118 *p++ = inode->i_generation;
119 return p;
120}
121
122#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
new file mode 100644
index 000000000000..9f057a4a5b06
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -0,0 +1,573 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_inum.h"
35#include "xfs_log.h"
36#include "xfs_sb.h"
37#include "xfs_dir.h"
38#include "xfs_dir2.h"
39#include "xfs_trans.h"
40#include "xfs_dmapi.h"
41#include "xfs_mount.h"
42#include "xfs_bmap_btree.h"
43#include "xfs_alloc_btree.h"
44#include "xfs_ialloc_btree.h"
45#include "xfs_alloc.h"
46#include "xfs_btree.h"
47#include "xfs_attr_sf.h"
48#include "xfs_dir_sf.h"
49#include "xfs_dir2_sf.h"
50#include "xfs_dinode.h"
51#include "xfs_inode.h"
52#include "xfs_error.h"
53#include "xfs_rw.h"
54#include "xfs_ioctl32.h"
55
56#include <linux/dcache.h>
57#include <linux/smp_lock.h>
58
59static struct vm_operations_struct linvfs_file_vm_ops;
60
61
62STATIC inline ssize_t
63__linvfs_read(
64 struct kiocb *iocb,
65 char __user *buf,
66 int ioflags,
67 size_t count,
68 loff_t pos)
69{
70 struct iovec iov = {buf, count};
71 struct file *file = iocb->ki_filp;
72 vnode_t *vp = LINVFS_GET_VP(file->f_dentry->d_inode);
73 ssize_t rval;
74
75 BUG_ON(iocb->ki_pos != pos);
76
77 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT;
79 VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
80 return rval;
81}
82
83
84STATIC ssize_t
85linvfs_aio_read(
86 struct kiocb *iocb,
87 char __user *buf,
88 size_t count,
89 loff_t pos)
90{
91 return __linvfs_read(iocb, buf, IO_ISAIO, count, pos);
92}
93
94STATIC ssize_t
95linvfs_aio_read_invis(
96 struct kiocb *iocb,
97 char __user *buf,
98 size_t count,
99 loff_t pos)
100{
101 return __linvfs_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
102}
103
104
105STATIC inline ssize_t
106__linvfs_write(
107 struct kiocb *iocb,
108 const char __user *buf,
109 int ioflags,
110 size_t count,
111 loff_t pos)
112{
113 struct iovec iov = {(void __user *)buf, count};
114 struct file *file = iocb->ki_filp;
115 struct inode *inode = file->f_mapping->host;
116 vnode_t *vp = LINVFS_GET_VP(inode);
117 ssize_t rval;
118
119 BUG_ON(iocb->ki_pos != pos);
120 if (unlikely(file->f_flags & O_DIRECT))
121 ioflags |= IO_ISDIRECT;
122
123 VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
124 return rval;
125}
126
127
128STATIC ssize_t
129linvfs_aio_write(
130 struct kiocb *iocb,
131 const char __user *buf,
132 size_t count,
133 loff_t pos)
134{
135 return __linvfs_write(iocb, buf, IO_ISAIO, count, pos);
136}
137
138STATIC ssize_t
139linvfs_aio_write_invis(
140 struct kiocb *iocb,
141 const char __user *buf,
142 size_t count,
143 loff_t pos)
144{
145 return __linvfs_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
146}
147
148
149STATIC inline ssize_t
150__linvfs_readv(
151 struct file *file,
152 const struct iovec *iov,
153 int ioflags,
154 unsigned long nr_segs,
155 loff_t *ppos)
156{
157 struct inode *inode = file->f_mapping->host;
158 vnode_t *vp = LINVFS_GET_VP(inode);
159 struct kiocb kiocb;
160 ssize_t rval;
161
162 init_sync_kiocb(&kiocb, file);
163 kiocb.ki_pos = *ppos;
164
165 if (unlikely(file->f_flags & O_DIRECT))
166 ioflags |= IO_ISDIRECT;
167 VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
168
169 *ppos = kiocb.ki_pos;
170 return rval;
171}
172
173STATIC ssize_t
174linvfs_readv(
175 struct file *file,
176 const struct iovec *iov,
177 unsigned long nr_segs,
178 loff_t *ppos)
179{
180 return __linvfs_readv(file, iov, 0, nr_segs, ppos);
181}
182
183STATIC ssize_t
184linvfs_readv_invis(
185 struct file *file,
186 const struct iovec *iov,
187 unsigned long nr_segs,
188 loff_t *ppos)
189{
190 return __linvfs_readv(file, iov, IO_INVIS, nr_segs, ppos);
191}
192
193
194STATIC inline ssize_t
195__linvfs_writev(
196 struct file *file,
197 const struct iovec *iov,
198 int ioflags,
199 unsigned long nr_segs,
200 loff_t *ppos)
201{
202 struct inode *inode = file->f_mapping->host;
203 vnode_t *vp = LINVFS_GET_VP(inode);
204 struct kiocb kiocb;
205 ssize_t rval;
206
207 init_sync_kiocb(&kiocb, file);
208 kiocb.ki_pos = *ppos;
209 if (unlikely(file->f_flags & O_DIRECT))
210 ioflags |= IO_ISDIRECT;
211
212 VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
213
214 *ppos = kiocb.ki_pos;
215 return rval;
216}
217
218
219STATIC ssize_t
220linvfs_writev(
221 struct file *file,
222 const struct iovec *iov,
223 unsigned long nr_segs,
224 loff_t *ppos)
225{
226 return __linvfs_writev(file, iov, 0, nr_segs, ppos);
227}
228
229STATIC ssize_t
230linvfs_writev_invis(
231 struct file *file,
232 const struct iovec *iov,
233 unsigned long nr_segs,
234 loff_t *ppos)
235{
236 return __linvfs_writev(file, iov, IO_INVIS, nr_segs, ppos);
237}
238
239STATIC ssize_t
240linvfs_sendfile(
241 struct file *filp,
242 loff_t *ppos,
243 size_t count,
244 read_actor_t actor,
245 void *target)
246{
247 vnode_t *vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
248 ssize_t rval;
249
250 VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval);
251 return rval;
252}
253
254
255STATIC int
256linvfs_open(
257 struct inode *inode,
258 struct file *filp)
259{
260 vnode_t *vp = LINVFS_GET_VP(inode);
261 int error;
262
263 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
264 return -EFBIG;
265
266 ASSERT(vp);
267 VOP_OPEN(vp, NULL, error);
268 return -error;
269}
270
271
272STATIC int
273linvfs_release(
274 struct inode *inode,
275 struct file *filp)
276{
277 vnode_t *vp = LINVFS_GET_VP(inode);
278 int error = 0;
279
280 if (vp)
281 VOP_RELEASE(vp, error);
282 return -error;
283}
284
285
286STATIC int
287linvfs_fsync(
288 struct file *filp,
289 struct dentry *dentry,
290 int datasync)
291{
292 struct inode *inode = dentry->d_inode;
293 vnode_t *vp = LINVFS_GET_VP(inode);
294 int error;
295 int flags = FSYNC_WAIT;
296
297 if (datasync)
298 flags |= FSYNC_DATA;
299
300 ASSERT(vp);
301 VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
302 return -error;
303}
304
305/*
306 * linvfs_readdir maps to VOP_READDIR().
307 * We need to build a uio, cred, ...
308 */
309
310#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
311
312STATIC int
313linvfs_readdir(
314 struct file *filp,
315 void *dirent,
316 filldir_t filldir)
317{
318 int error = 0;
319 vnode_t *vp;
320 uio_t uio;
321 iovec_t iov;
322 int eof = 0;
323 caddr_t read_buf;
324 int namelen, size = 0;
325 size_t rlen = PAGE_CACHE_SIZE;
326 xfs_off_t start_offset, curr_offset;
327 xfs_dirent_t *dbp = NULL;
328
329 vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
330 ASSERT(vp);
331
332 /* Try fairly hard to get memory */
333 do {
334 if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL)))
335 break;
336 rlen >>= 1;
337 } while (rlen >= 1024);
338
339 if (read_buf == NULL)
340 return -ENOMEM;
341
342 uio.uio_iov = &iov;
343 uio.uio_segflg = UIO_SYSSPACE;
344 curr_offset = filp->f_pos;
345 if (filp->f_pos != 0x7fffffff)
346 uio.uio_offset = filp->f_pos;
347 else
348 uio.uio_offset = 0xffffffff;
349
350 while (!eof) {
351 uio.uio_resid = iov.iov_len = rlen;
352 iov.iov_base = read_buf;
353 uio.uio_iovcnt = 1;
354
355 start_offset = uio.uio_offset;
356
357 VOP_READDIR(vp, &uio, NULL, &eof, error);
358 if ((uio.uio_offset == start_offset) || error) {
359 size = 0;
360 break;
361 }
362
363 size = rlen - uio.uio_resid;
364 dbp = (xfs_dirent_t *)read_buf;
365 while (size > 0) {
366 namelen = strlen(dbp->d_name);
367
368 if (filldir(dirent, dbp->d_name, namelen,
369 (loff_t) curr_offset & 0x7fffffff,
370 (ino_t) dbp->d_ino,
371 DT_UNKNOWN)) {
372 goto done;
373 }
374 size -= dbp->d_reclen;
375 curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */;
376 dbp = nextdp(dbp);
377 }
378 }
379done:
380 if (!error) {
381 if (size == 0)
382 filp->f_pos = uio.uio_offset & 0x7fffffff;
383 else if (dbp)
384 filp->f_pos = curr_offset;
385 }
386
387 kfree(read_buf);
388 return -error;
389}
390
391
392STATIC int
393linvfs_file_mmap(
394 struct file *filp,
395 struct vm_area_struct *vma)
396{
397 struct inode *ip = filp->f_dentry->d_inode;
398 vnode_t *vp = LINVFS_GET_VP(ip);
399 vattr_t va = { .va_mask = XFS_AT_UPDATIME };
400 int error;
401
402 if (vp->v_vfsp->vfs_flag & VFS_DMI) {
403 xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
404
405 error = -XFS_SEND_MMAP(mp, vma, 0);
406 if (error)
407 return error;
408 }
409
410 vma->vm_ops = &linvfs_file_vm_ops;
411
412 VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error);
413 if (!error)
414 vn_revalidate(vp); /* update Linux inode flags */
415 return 0;
416}
417
418
419STATIC long
420linvfs_ioctl(
421 struct file *filp,
422 unsigned int cmd,
423 unsigned long arg)
424{
425 int error;
426 struct inode *inode = filp->f_dentry->d_inode;
427 vnode_t *vp = LINVFS_GET_VP(inode);
428
429 VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error);
430 VMODIFY(vp);
431
 432	/* NOTE: some of the ioctls return positive numbers as a
433 * byte count indicating success, such as
434 * readlink_by_handle. So we don't "sign flip"
435 * like most other routines. This means true
436 * errors need to be returned as a negative value.
437 */
438 return error;
439}
440
441STATIC long
442linvfs_ioctl_invis(
443 struct file *filp,
444 unsigned int cmd,
445 unsigned long arg)
446{
447 int error;
448 struct inode *inode = filp->f_dentry->d_inode;
449 vnode_t *vp = LINVFS_GET_VP(inode);
450
451 ASSERT(vp);
452 VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error);
453 VMODIFY(vp);
454
 455	/* NOTE: some of the ioctls return positive numbers as a
456 * byte count indicating success, such as
457 * readlink_by_handle. So we don't "sign flip"
458 * like most other routines. This means true
459 * errors need to be returned as a negative value.
460 */
461 return error;
462}
463
464#ifdef HAVE_VMOP_MPROTECT
465STATIC int
466linvfs_mprotect(
467 struct vm_area_struct *vma,
468 unsigned int newflags)
469{
470 vnode_t *vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode);
471 int error = 0;
472
473 if (vp->v_vfsp->vfs_flag & VFS_DMI) {
474 if ((vma->vm_flags & VM_MAYSHARE) &&
475 (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) {
476 xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
477
478 error = XFS_SEND_MMAP(mp, vma, VM_WRITE);
479 }
480 }
481 return error;
482}
483#endif /* HAVE_VMOP_MPROTECT */
484
485#ifdef HAVE_FOP_OPEN_EXEC
486/* If the user is attempting to execute a file that is offline then
 487 * we have to trigger a DMAPI READ event before the file is marked as busy;
 488 * otherwise the invisible I/O will not be able to write to the file to bring
489 * it back online.
490 */
491STATIC int
492linvfs_open_exec(
493 struct inode *inode)
494{
495 vnode_t *vp = LINVFS_GET_VP(inode);
496 xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
497 int error = 0;
498 bhv_desc_t *bdp;
499 xfs_inode_t *ip;
500
501 if (vp->v_vfsp->vfs_flag & VFS_DMI) {
502 bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
503 if (!bdp) {
504 error = -EINVAL;
505 goto open_exec_out;
506 }
507 ip = XFS_BHVTOI(bdp);
508 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
509 error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
510 0, 0, 0, NULL);
511 }
512 }
513open_exec_out:
514 return error;
515}
516#endif /* HAVE_FOP_OPEN_EXEC */
517
518struct file_operations linvfs_file_operations = {
519 .llseek = generic_file_llseek,
520 .read = do_sync_read,
521 .write = do_sync_write,
522 .readv = linvfs_readv,
523 .writev = linvfs_writev,
524 .aio_read = linvfs_aio_read,
525 .aio_write = linvfs_aio_write,
526 .sendfile = linvfs_sendfile,
527 .unlocked_ioctl = linvfs_ioctl,
528#ifdef CONFIG_COMPAT
529 .compat_ioctl = xfs_compat_ioctl,
530#endif
531 .mmap = linvfs_file_mmap,
532 .open = linvfs_open,
533 .release = linvfs_release,
534 .fsync = linvfs_fsync,
535#ifdef HAVE_FOP_OPEN_EXEC
536 .open_exec = linvfs_open_exec,
537#endif
538};
539
540struct file_operations linvfs_invis_file_operations = {
541 .llseek = generic_file_llseek,
542 .read = do_sync_read,
543 .write = do_sync_write,
544 .readv = linvfs_readv_invis,
545 .writev = linvfs_writev_invis,
546 .aio_read = linvfs_aio_read_invis,
547 .aio_write = linvfs_aio_write_invis,
548 .sendfile = linvfs_sendfile,
549 .unlocked_ioctl = linvfs_ioctl_invis,
550#ifdef CONFIG_COMPAT
551 .compat_ioctl = xfs_compat_invis_ioctl,
552#endif
553 .mmap = linvfs_file_mmap,
554 .open = linvfs_open,
555 .release = linvfs_release,
556 .fsync = linvfs_fsync,
557};
558
559
560struct file_operations linvfs_dir_operations = {
561 .read = generic_read_dir,
562 .readdir = linvfs_readdir,
563 .unlocked_ioctl = linvfs_ioctl,
564 .fsync = linvfs_fsync,
565};
566
567static struct vm_operations_struct linvfs_file_vm_ops = {
568 .nopage = filemap_nopage,
569 .populate = filemap_populate,
570#ifdef HAVE_VMOP_MPROTECT
571 .mprotect = linvfs_mprotect,
572#endif
573};
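/*
 * Summary of the three operation tables above: linvfs_file_operations
 * serves ordinary opens, linvfs_invis_file_operations is installed by
 * xfs_open_by_handle() so that DMAPI "invisible" I/O bypasses event
 * generation (note the IO_INVIS flag in its ioctl path), and
 * linvfs_dir_operations covers directories.
 */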
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
new file mode 100644
index 000000000000..05ebd30ec96f
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -0,0 +1,124 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35/*
36 * Stub for no-op vnode operations that always succeed (return zero status).
37 */
38int
39fs_noerr(void)
40{
41 return 0;
42}
43
44/*
45 * Operation unsupported under this file system.
46 */
47int
48fs_nosys(void)
49{
50 return ENOSYS;
51}
52
53/*
54 * Stub for inactive, strategy, and read/write lock/unlock. Does nothing.
55 */
56/* ARGSUSED */
57void
58fs_noval(void)
59{
60}
61
62/*
63 * vnode pcache layer for vnode_tosspages.
64 * 'last' parameter unused but left in for IRIX compatibility
65 */
66void
67fs_tosspages(
68 bhv_desc_t *bdp,
69 xfs_off_t first,
70 xfs_off_t last,
71 int fiopt)
72{
73 vnode_t *vp = BHV_TO_VNODE(bdp);
74 struct inode *ip = LINVFS_GET_IP(vp);
75
76 if (VN_CACHED(vp))
77 truncate_inode_pages(ip->i_mapping, first);
78}
79
80
81/*
82 * vnode pcache layer for vnode_flushinval_pages.
83 * 'last' parameter unused but left in for IRIX compatibility
84 */
85void
86fs_flushinval_pages(
87 bhv_desc_t *bdp,
88 xfs_off_t first,
89 xfs_off_t last,
90 int fiopt)
91{
92 vnode_t *vp = BHV_TO_VNODE(bdp);
93 struct inode *ip = LINVFS_GET_IP(vp);
94
95 if (VN_CACHED(vp)) {
96 filemap_fdatawrite(ip->i_mapping);
97 filemap_fdatawait(ip->i_mapping);
98
99 truncate_inode_pages(ip->i_mapping, first);
100 }
101}
102
103/*
104 * vnode pcache layer for vnode_flush_pages.
105 * 'last' parameter unused but left in for IRIX compatibility
106 */
107int
108fs_flush_pages(
109 bhv_desc_t *bdp,
110 xfs_off_t first,
111 xfs_off_t last,
112 uint64_t flags,
113 int fiopt)
114{
115 vnode_t *vp = BHV_TO_VNODE(bdp);
116 struct inode *ip = LINVFS_GET_IP(vp);
117
118 if (VN_CACHED(vp)) {
119 filemap_fdatawrite(ip->i_mapping);
120 filemap_fdatawait(ip->i_mapping);
121 }
122
123 return 0;
124}
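/*
 * A minimal, hypothetical caller sketch illustrating the error convention
 * used by the stubs above: they return positive errnos, which the
 * Linux-facing entry points negate before returning to the VFS:
 *
 *	int example_entry(void)
 *	{
 *		int error = fs_nosys();		(positive ENOSYS)
 *		return -error;			(VFS expects -ENOSYS)
 *	}
 */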
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
new file mode 100644
index 000000000000..2db9ddbd4567
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.h
@@ -0,0 +1,48 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_FS_SUBR_H__
33#define __XFS_FS_SUBR_H__
34
35/*
36 * Utilities shared among file system implementations.
37 */
38
39struct cred;
40
41extern int fs_noerr(void);
42extern int fs_nosys(void);
43extern void fs_noval(void);
44extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
45extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
46extern int fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
47
48#endif /* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
new file mode 100644
index 000000000000..a6da5b4fd240
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -0,0 +1,74 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains globals needed by XFS that were normally defined
35 * somewhere else in IRIX.
36 */
37
38#include "xfs.h"
39#include "xfs_cred.h"
40#include "xfs_sysctl.h"
41
42/*
43 * System memory size - used to scale certain data structures in XFS.
44 */
45unsigned long xfs_physmem;
46
47/*
48 * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
49 * since other XFS code uses these values.  Times are measured in centisecs
50 * (i.e. 100ths of a second).
51 */
52xfs_param_t xfs_params = {
53 /* MIN DFLT MAX */
54 .restrict_chown = { 0, 1, 1 },
55 .sgid_inherit = { 0, 0, 1 },
56 .symlink_mode = { 0, 0, 1 },
57 .panic_mask = { 0, 0, 127 },
58 .error_level = { 0, 3, 11 },
59 .syncd_timer = { 1*100, 30*100, 7200*100},
60 .stats_clear = { 0, 0, 1 },
61 .inherit_sync = { 0, 1, 1 },
62 .inherit_nodump = { 0, 1, 1 },
63 .inherit_noatim = { 0, 1, 1 },
64 .xfs_buf_timer = { 100/2, 1*100, 30*100 },
65 .xfs_buf_age = { 1*100, 15*100, 7200*100},
66 .inherit_nosym = { 0, 0, 1 },
67 .rotorstep = { 1, 1, 255 },
68};
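/*
 * Example reading of the table above: .syncd_timer ranges from 1*100 to
 * 7200*100 centiseconds (1 second to 2 hours) and defaults to 30*100
 * centiseconds, i.e. 30 seconds.
 */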
69
70/*
71 * Global system credential structure.
72 */
73cred_t sys_cred_val, *sys_cred = &sys_cred_val;
74
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
new file mode 100644
index 000000000000..e81e2f38a853
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_GLOBALS_H__
33#define __XFS_GLOBALS_H__
34
35/*
36 * This file declares globals needed by XFS that were normally defined
37 * somewhere else in IRIX.
38 */
39
40extern uint64_t xfs_panic_mask; /* set to cause more panics */
41extern unsigned long xfs_physmem;
42extern struct cred *sys_cred;
43
44#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
new file mode 100644
index 000000000000..69809eef8a54
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -0,0 +1,1336 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_fs.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_bit.h"
57#include "xfs_rtalloc.h"
58#include "xfs_error.h"
59#include "xfs_itable.h"
60#include "xfs_rw.h"
61#include "xfs_acl.h"
62#include "xfs_cap.h"
63#include "xfs_mac.h"
64#include "xfs_attr.h"
65#include "xfs_buf_item.h"
66#include "xfs_utils.h"
67#include "xfs_dfrag.h"
68#include "xfs_fsops.h"
69
70#include <linux/dcache.h>
71#include <linux/mount.h>
72#include <linux/namei.h>
73#include <linux/pagemap.h>
74
75/*
76 * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
77 * a file or fs handle.
78 *
79 * XFS_IOC_PATH_TO_FSHANDLE
80 * returns fs handle for a mount point or path within that mount point
81 * XFS_IOC_FD_TO_HANDLE
82 * returns full handle for a FD opened in user space
83 * XFS_IOC_PATH_TO_HANDLE
84 * returns full handle for a path
85 */
86STATIC int
87xfs_find_handle(
88 unsigned int cmd,
89 void __user *arg)
90{
91 int hsize;
92 xfs_handle_t handle;
93 xfs_fsop_handlereq_t hreq;
94 struct inode *inode;
95 struct vnode *vp;
96
97 if (copy_from_user(&hreq, arg, sizeof(hreq)))
98 return -XFS_ERROR(EFAULT);
99
100 memset((char *)&handle, 0, sizeof(handle));
101
102 switch (cmd) {
103 case XFS_IOC_PATH_TO_FSHANDLE:
104 case XFS_IOC_PATH_TO_HANDLE: {
105 struct nameidata nd;
106 int error;
107
108 error = user_path_walk_link((const char __user *)hreq.path, &nd);
109 if (error)
110 return error;
111
112 ASSERT(nd.dentry);
113 ASSERT(nd.dentry->d_inode);
114 inode = igrab(nd.dentry->d_inode);
115 path_release(&nd);
116 break;
117 }
118
119 case XFS_IOC_FD_TO_HANDLE: {
120 struct file *file;
121
122 file = fget(hreq.fd);
123 if (!file)
124 return -EBADF;
125
126 ASSERT(file->f_dentry);
127 ASSERT(file->f_dentry->d_inode);
128 inode = igrab(file->f_dentry->d_inode);
129 fput(file);
130 break;
131 }
132
133 default:
134 ASSERT(0);
135 return -XFS_ERROR(EINVAL);
136 }
137
138 if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
139 /* we're not in XFS anymore, Toto */
140 iput(inode);
141 return -XFS_ERROR(EINVAL);
142 }
143
144 /* we need the vnode */
145 vp = LINVFS_GET_VP(inode);
146 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
147 iput(inode);
148 return -XFS_ERROR(EBADF);
149 }
150
151 /* now we can grab the fsid */
152 memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
153 hsize = sizeof(xfs_fsid_t);
154
155 if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
156 xfs_inode_t *ip;
157 bhv_desc_t *bhv;
158 int lock_mode;
159
160 /* need to get access to the xfs_inode to read the generation */
161 bhv = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
162 ASSERT(bhv);
163 ip = XFS_BHVTOI(bhv);
164 ASSERT(ip);
165 lock_mode = xfs_ilock_map_shared(ip);
166
167 /* fill in fid section of handle from inode */
168 handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) -
169 sizeof(handle.ha_fid.xfs_fid_len);
170 handle.ha_fid.xfs_fid_pad = 0;
171 handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen;
172 handle.ha_fid.xfs_fid_ino = ip->i_ino;
173
174 xfs_iunlock_map_shared(ip, lock_mode);
175
176 hsize = XFS_HSIZE(handle);
177 }
178
179 /* now copy our handle into the user buffer & write out the size */
180 if (copy_to_user(hreq.ohandle, &handle, hsize) ||
181 copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
182 iput(inode);
183 return -XFS_ERROR(EFAULT);
184 }
185
186 iput(inode);
187 return 0;
188}
189
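/*
 * A minimal userspace sketch of the handle calls above, assuming the
 * xfs_fsop_handlereq_t, xfs_handle_t and XFS_IOC_* definitions from
 * xfs_fs.h; "xfs_fd" is a hypothetical open descriptor on the XFS
 * filesystem and error handling is elided:
 *
 *	xfs_fsop_handlereq_t hreq = { 0 };
 *	xfs_handle_t handle;
 *	__u32 hlen = sizeof(handle);
 *
 *	hreq.path     = "/mnt/xfs/some/file";
 *	hreq.ohandle  = &handle;
 *	hreq.ohandlen = &hlen;
 *	if (ioctl(xfs_fd, XFS_IOC_PATH_TO_HANDLE, &hreq) < 0)
 *		perror("XFS_IOC_PATH_TO_HANDLE");
 */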
190
191/*
192 * Convert userspace handle data into vnode (and inode).
193 * We [ab]use the fact that all the fsop_handlereq ioctl calls
194 * have a data structure argument whose first component is always
195 * a xfs_fsop_handlereq_t, so we can cast to and from this type.
196 * This allows us to optimise the copy_from_user calls and gives
197 * a handy, shared routine.
198 *
199 * If no error, caller must always VN_RELE the returned vp.
200 */
201STATIC int
202xfs_vget_fsop_handlereq(
203 xfs_mount_t *mp,
204 struct inode *parinode, /* parent inode pointer */
205 xfs_fsop_handlereq_t *hreq,
206 vnode_t **vp,
207 struct inode **inode)
208{
209 void __user *hanp;
210 size_t hlen;
211 xfs_fid_t *xfid;
212 xfs_handle_t *handlep;
213 xfs_handle_t handle;
214 xfs_inode_t *ip;
215 struct inode *inodep;
216 vnode_t *vpp;
217 xfs_ino_t ino;
218 __u32 igen;
219 int error;
220
221 /*
222 * Only allow handle opens under a directory.
223 */
224 if (!S_ISDIR(parinode->i_mode))
225 return XFS_ERROR(ENOTDIR);
226
227 hanp = hreq->ihandle;
228 hlen = hreq->ihandlen;
229 handlep = &handle;
230
231 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
232 return XFS_ERROR(EINVAL);
233 if (copy_from_user(handlep, hanp, hlen))
234 return XFS_ERROR(EFAULT);
235 if (hlen < sizeof(*handlep))
236 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
237 if (hlen > sizeof(handlep->ha_fsid)) {
238 if (handlep->ha_fid.xfs_fid_len !=
239 (hlen - sizeof(handlep->ha_fsid)
240 - sizeof(handlep->ha_fid.xfs_fid_len))
241 || handlep->ha_fid.xfs_fid_pad)
242 return XFS_ERROR(EINVAL);
243 }
244
245 /*
246 * Crack the handle, obtain the inode # & generation #
247 */
248 xfid = (struct xfs_fid *)&handlep->ha_fid;
249 if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
250 ino = xfid->xfs_fid_ino;
251 igen = xfid->xfs_fid_gen;
252 } else {
253 return XFS_ERROR(EINVAL);
254 }
255
256 /*
257 * Get the XFS inode, building a vnode to go with it.
258 */
259 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
260 if (error)
261 return error;
262 if (ip == NULL)
263 return XFS_ERROR(EIO);
264 if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
265 xfs_iput_new(ip, XFS_ILOCK_SHARED);
266 return XFS_ERROR(ENOENT);
267 }
268
269 vpp = XFS_ITOV(ip);
270 inodep = LINVFS_GET_IP(vpp);
271 xfs_iunlock(ip, XFS_ILOCK_SHARED);
272
273 *vp = vpp;
274 *inode = inodep;
275 return 0;
276}
277
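/*
 * The layout assumption above, illustrated with one of the xfs_fs.h
 * request structures (paraphrased; the generic hreq must come first):
 *
 *	typedef struct xfs_fsop_attrlist_handlereq {
 *		struct xfs_fsop_handlereq hreq;		(first member)
 *		struct xfs_attrlist_cursor pos;
 *		__u32 flags;
 *		__u32 buflen;
 *		void __user *buffer;
 *	} xfs_fsop_attrlist_handlereq_t;
 */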
278STATIC int
279xfs_open_by_handle(
280 xfs_mount_t *mp,
281 void __user *arg,
282 struct file *parfilp,
283 struct inode *parinode)
284{
285 int error;
286 int new_fd;
287 int permflag;
288 struct file *filp;
289 struct inode *inode;
290 struct dentry *dentry;
291 vnode_t *vp;
292 xfs_fsop_handlereq_t hreq;
293
294 if (!capable(CAP_SYS_ADMIN))
295 return -XFS_ERROR(EPERM);
296 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
297 return -XFS_ERROR(EFAULT);
298
299 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
300 if (error)
301 return -error;
302
303 /* Restrict xfs_open_by_handle to directories & regular files. */
304 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
305 iput(inode);
306 return -XFS_ERROR(EINVAL);
307 }
308
309#if BITS_PER_LONG != 32
310 hreq.oflags |= O_LARGEFILE;
311#endif
312 /* Put open permission in namei format. */
313 permflag = hreq.oflags;
314 if ((permflag+1) & O_ACCMODE)
315 permflag++;
316 if (permflag & O_TRUNC)
317 permflag |= 2;
318
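	/*
	 * The "+1" trick above maps the open(2) access-mode encoding onto
	 * FMODE-style bits: O_RDONLY (0) -> 1 (read), O_WRONLY (1) -> 2
	 * (write), O_RDWR (2) -> 3 (read|write); O_TRUNC additionally
	 * implies write permission (the "|= 2").
	 */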
319 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
320 (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
321 iput(inode);
322 return -XFS_ERROR(EPERM);
323 }
324
325 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
326 iput(inode);
327 return -XFS_ERROR(EACCES);
328 }
329
330 /* Can't write directories. */
331 if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
332 iput(inode);
333 return -XFS_ERROR(EISDIR);
334 }
335
336 if ((new_fd = get_unused_fd()) < 0) {
337 iput(inode);
338 return new_fd;
339 }
340
341 dentry = d_alloc_anon(inode);
342 if (dentry == NULL) {
343 iput(inode);
344 put_unused_fd(new_fd);
345 return -XFS_ERROR(ENOMEM);
346 }
347
348 /* Ensure umount returns EBUSY on umounts while this file is open. */
349 mntget(parfilp->f_vfsmnt);
350
351 /* Create file pointer. */
352 filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags);
353 if (IS_ERR(filp)) {
354 put_unused_fd(new_fd);
355 return -XFS_ERROR(-PTR_ERR(filp));
356 }
357	if (S_ISREG(inode->i_mode))
358 filp->f_op = &linvfs_invis_file_operations;
359
360 fd_install(new_fd, filp);
361 return new_fd;
362}
363
364STATIC int
365xfs_readlink_by_handle(
366 xfs_mount_t *mp,
367 void __user *arg,
368 struct file *parfilp,
369 struct inode *parinode)
370{
371 int error;
372 struct iovec aiov;
373 struct uio auio;
374 struct inode *inode;
375 xfs_fsop_handlereq_t hreq;
376 vnode_t *vp;
377 __u32 olen;
378
379 if (!capable(CAP_SYS_ADMIN))
380 return -XFS_ERROR(EPERM);
381 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
382 return -XFS_ERROR(EFAULT);
383
384 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
385 if (error)
386 return -error;
387
388 /* Restrict this handle operation to symlinks only. */
389 if (vp->v_type != VLNK) {
390 VN_RELE(vp);
391 return -XFS_ERROR(EINVAL);
392 }
393
394 if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
395 VN_RELE(vp);
396 return -XFS_ERROR(EFAULT);
397 }
398 aiov.iov_len = olen;
399 aiov.iov_base = hreq.ohandle;
400
401 auio.uio_iov = &aiov;
402 auio.uio_iovcnt = 1;
403 auio.uio_offset = 0;
404 auio.uio_segflg = UIO_USERSPACE;
405 auio.uio_resid = olen;
406
407 VOP_READLINK(vp, &auio, IO_INVIS, NULL, error);
408
409 VN_RELE(vp);
410 return (olen - auio.uio_resid);
411}
412
413STATIC int
414xfs_fssetdm_by_handle(
415 xfs_mount_t *mp,
416 void __user *arg,
417 struct file *parfilp,
418 struct inode *parinode)
419{
420 int error;
421 struct fsdmidata fsd;
422 xfs_fsop_setdm_handlereq_t dmhreq;
423 struct inode *inode;
424 bhv_desc_t *bdp;
425 vnode_t *vp;
426
427 if (!capable(CAP_MKNOD))
428 return -XFS_ERROR(EPERM);
429 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
430 return -XFS_ERROR(EFAULT);
431
432 error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode);
433 if (error)
434 return -error;
435
436 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
437 VN_RELE(vp);
438 return -XFS_ERROR(EPERM);
439 }
440
441 if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
442 VN_RELE(vp);
443 return -XFS_ERROR(EFAULT);
444 }
445
446 bdp = bhv_base_unlocked(VN_BHV_HEAD(vp));
447 error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL);
448
449 VN_RELE(vp);
450 if (error)
451 return -error;
452 return 0;
453}
454
455STATIC int
456xfs_attrlist_by_handle(
457 xfs_mount_t *mp,
458 void __user *arg,
459 struct file *parfilp,
460 struct inode *parinode)
461{
462 int error;
463 attrlist_cursor_kern_t *cursor;
464 xfs_fsop_attrlist_handlereq_t al_hreq;
465 struct inode *inode;
466 vnode_t *vp;
467 char *kbuf;
468
469 if (!capable(CAP_SYS_ADMIN))
470 return -XFS_ERROR(EPERM);
471 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
472 return -XFS_ERROR(EFAULT);
473 if (al_hreq.buflen > XATTR_LIST_MAX)
474 return -XFS_ERROR(EINVAL);
475
476 error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq,
477 &vp, &inode);
478 if (error)
479 goto out;
480	error = ENOMEM;
481	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
482	if (!kbuf)
483		goto out_vn_rele;
484
485 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
486 VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags,
487 cursor, NULL, error);
488 if (error)
489 goto out_kfree;
490
491 if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
492		error = EFAULT;
493
494 out_kfree:
495 kfree(kbuf);
496 out_vn_rele:
497 VN_RELE(vp);
498 out:
499 return -error;
500}
501
502STATIC int
503xfs_attrmulti_attr_get(
504 struct vnode *vp,
505 char *name,
506 char __user *ubuf,
507 __uint32_t *len,
508 __uint32_t flags)
509{
510 char *kbuf;
511 int error = EFAULT;
512
513 if (*len > XATTR_SIZE_MAX)
514 return EINVAL;
515 kbuf = kmalloc(*len, GFP_KERNEL);
516 if (!kbuf)
517 return ENOMEM;
518
519 VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error);
520 if (error)
521 goto out_kfree;
522
523 if (copy_to_user(ubuf, kbuf, *len))
524 error = EFAULT;
525
526 out_kfree:
527 kfree(kbuf);
528 return error;
529}
530
531STATIC int
532xfs_attrmulti_attr_set(
533 struct vnode *vp,
534 char *name,
535 const char __user *ubuf,
536 __uint32_t len,
537 __uint32_t flags)
538{
539 char *kbuf;
540 int error = EFAULT;
541
542 if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
543 return EPERM;
544 if (len > XATTR_SIZE_MAX)
545 return EINVAL;
546
547 kbuf = kmalloc(len, GFP_KERNEL);
548 if (!kbuf)
549 return ENOMEM;
550
551 if (copy_from_user(kbuf, ubuf, len))
552 goto out_kfree;
553
554 VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error);
555
556 out_kfree:
557 kfree(kbuf);
558 return error;
559}
560
561STATIC int
562xfs_attrmulti_attr_remove(
563 struct vnode *vp,
564 char *name,
565 __uint32_t flags)
566{
567 int error;
568
569 if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
570 return EPERM;
571
572 VOP_ATTR_REMOVE(vp, name, flags, NULL, error);
573 return error;
574}
575
576STATIC int
577xfs_attrmulti_by_handle(
578 xfs_mount_t *mp,
579 void __user *arg,
580 struct file *parfilp,
581 struct inode *parinode)
582{
583 int error;
584 xfs_attr_multiop_t *ops;
585 xfs_fsop_attrmulti_handlereq_t am_hreq;
586 struct inode *inode;
587 vnode_t *vp;
588 unsigned int i, size;
589 char *attr_name;
590
591 if (!capable(CAP_SYS_ADMIN))
592 return -XFS_ERROR(EPERM);
593 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
594 return -XFS_ERROR(EFAULT);
595
596 error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode);
597 if (error)
598 goto out;
599
600 error = E2BIG;
601 size = am_hreq.opcount * sizeof(attr_multiop_t);
602 if (!size || size > 16 * PAGE_SIZE)
603 goto out_vn_rele;
604
605 error = ENOMEM;
606 ops = kmalloc(size, GFP_KERNEL);
607 if (!ops)
608 goto out_vn_rele;
609
610 error = EFAULT;
611 if (copy_from_user(ops, am_hreq.ops, size))
612 goto out_kfree_ops;
613
614 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
615 if (!attr_name)
616 goto out_kfree_ops;
617
618
619 error = 0;
620 for (i = 0; i < am_hreq.opcount; i++) {
621 ops[i].am_error = strncpy_from_user(attr_name,
622 ops[i].am_attrname, MAXNAMELEN);
623 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
624			error = ERANGE;
625 if (ops[i].am_error < 0)
626 break;
627
628 switch (ops[i].am_opcode) {
629 case ATTR_OP_GET:
630 ops[i].am_error = xfs_attrmulti_attr_get(vp,
631 attr_name, ops[i].am_attrvalue,
632 &ops[i].am_length, ops[i].am_flags);
633 break;
634 case ATTR_OP_SET:
635 ops[i].am_error = xfs_attrmulti_attr_set(vp,
636 attr_name, ops[i].am_attrvalue,
637 ops[i].am_length, ops[i].am_flags);
638 break;
639 case ATTR_OP_REMOVE:
640 ops[i].am_error = xfs_attrmulti_attr_remove(vp,
641 attr_name, ops[i].am_flags);
642 break;
643 default:
644 ops[i].am_error = EINVAL;
645 }
646 }
647
648 if (copy_to_user(am_hreq.ops, ops, size))
649 error = XFS_ERROR(EFAULT);
650
651 kfree(attr_name);
652 out_kfree_ops:
653 kfree(ops);
654 out_vn_rele:
655 VN_RELE(vp);
656 out:
657 return -error;
658}
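/*
 * Note the two error channels above: setup failures come back as the
 * negated ioctl return value, while per-operation results are reported
 * through each ops[i].am_error (positive errnos, copied back to
 * userspace even when individual operations fail).
 */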
659
660/* Prototypes for a few of the stack-hungry cases that have
661 * their own functions.  Functions are defined after their use
662 * so gcc doesn't get fancy and inline them with -O3. */
663
664STATIC int
665xfs_ioc_space(
666 bhv_desc_t *bdp,
667 vnode_t *vp,
668 struct file *filp,
669 int flags,
670 unsigned int cmd,
671 void __user *arg);
672
673STATIC int
674xfs_ioc_bulkstat(
675 xfs_mount_t *mp,
676 unsigned int cmd,
677 void __user *arg);
678
679STATIC int
680xfs_ioc_fsgeometry_v1(
681 xfs_mount_t *mp,
682 void __user *arg);
683
684STATIC int
685xfs_ioc_fsgeometry(
686 xfs_mount_t *mp,
687 void __user *arg);
688
689STATIC int
690xfs_ioc_xattr(
691 vnode_t *vp,
692 xfs_inode_t *ip,
693 struct file *filp,
694 unsigned int cmd,
695 void __user *arg);
696
697STATIC int
698xfs_ioc_getbmap(
699 bhv_desc_t *bdp,
700 struct file *filp,
701 int flags,
702 unsigned int cmd,
703 void __user *arg);
704
705STATIC int
706xfs_ioc_getbmapx(
707 bhv_desc_t *bdp,
708 void __user *arg);
709
710int
711xfs_ioctl(
712 bhv_desc_t *bdp,
713 struct inode *inode,
714 struct file *filp,
715 int ioflags,
716 unsigned int cmd,
717 void __user *arg)
718{
719 int error;
720 vnode_t *vp;
721 xfs_inode_t *ip;
722 xfs_mount_t *mp;
723
724 vp = LINVFS_GET_VP(inode);
725
726 vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address);
727
728 ip = XFS_BHVTOI(bdp);
729 mp = ip->i_mount;
730
731 switch (cmd) {
732
733 case XFS_IOC_ALLOCSP:
734 case XFS_IOC_FREESP:
735 case XFS_IOC_RESVSP:
736 case XFS_IOC_UNRESVSP:
737 case XFS_IOC_ALLOCSP64:
738 case XFS_IOC_FREESP64:
739 case XFS_IOC_RESVSP64:
740 case XFS_IOC_UNRESVSP64:
741 /*
742 * Only allow the sys admin to reserve space unless
743 * unwritten extents are enabled.
744 */
745 if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
746 !capable(CAP_SYS_ADMIN))
747 return -EPERM;
748
749 return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg);
750
751 case XFS_IOC_DIOINFO: {
752 struct dioattr da;
753 xfs_buftarg_t *target =
754 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
755 mp->m_rtdev_targp : mp->m_ddev_targp;
756
757 da.d_mem = da.d_miniosz = 1 << target->pbr_sshift;
758 /* The size dio will do in one go */
759 da.d_maxiosz = 64 * PAGE_CACHE_SIZE;
760
761 if (copy_to_user(arg, &da, sizeof(da)))
762 return -XFS_ERROR(EFAULT);
763 return 0;
764 }
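	/*
	 * A minimal userspace sketch of the DIOINFO query above, assuming
	 * struct dioattr from xfs_fs.h and a hypothetical open XFS fd:
	 *
	 *	struct dioattr da;
	 *
	 *	if (ioctl(fd, XFS_IOC_DIOINFO, &da) == 0)
	 *		printf("O_DIRECT align %u min %u max %u\n",
	 *		       da.d_mem, da.d_miniosz, da.d_maxiosz);
	 */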
765
766 case XFS_IOC_FSBULKSTAT_SINGLE:
767 case XFS_IOC_FSBULKSTAT:
768 case XFS_IOC_FSINUMBERS:
769 return xfs_ioc_bulkstat(mp, cmd, arg);
770
771 case XFS_IOC_FSGEOMETRY_V1:
772 return xfs_ioc_fsgeometry_v1(mp, arg);
773
774 case XFS_IOC_FSGEOMETRY:
775 return xfs_ioc_fsgeometry(mp, arg);
776
777 case XFS_IOC_GETVERSION:
778 case XFS_IOC_GETXFLAGS:
779 case XFS_IOC_SETXFLAGS:
780 case XFS_IOC_FSGETXATTR:
781 case XFS_IOC_FSSETXATTR:
782 case XFS_IOC_FSGETXATTRA:
783 return xfs_ioc_xattr(vp, ip, filp, cmd, arg);
784
785 case XFS_IOC_FSSETDM: {
786 struct fsdmidata dmi;
787
788 if (copy_from_user(&dmi, arg, sizeof(dmi)))
789 return -XFS_ERROR(EFAULT);
790
791 error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate,
792 NULL);
793 return -error;
794 }
795
796 case XFS_IOC_GETBMAP:
797 case XFS_IOC_GETBMAPA:
798 return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg);
799
800 case XFS_IOC_GETBMAPX:
801 return xfs_ioc_getbmapx(bdp, arg);
802
803 case XFS_IOC_FD_TO_HANDLE:
804 case XFS_IOC_PATH_TO_HANDLE:
805 case XFS_IOC_PATH_TO_FSHANDLE:
806 return xfs_find_handle(cmd, arg);
807
808 case XFS_IOC_OPEN_BY_HANDLE:
809 return xfs_open_by_handle(mp, arg, filp, inode);
810
811 case XFS_IOC_FSSETDM_BY_HANDLE:
812 return xfs_fssetdm_by_handle(mp, arg, filp, inode);
813
814 case XFS_IOC_READLINK_BY_HANDLE:
815 return xfs_readlink_by_handle(mp, arg, filp, inode);
816
817 case XFS_IOC_ATTRLIST_BY_HANDLE:
818 return xfs_attrlist_by_handle(mp, arg, filp, inode);
819
820 case XFS_IOC_ATTRMULTI_BY_HANDLE:
821 return xfs_attrmulti_by_handle(mp, arg, filp, inode);
822
823 case XFS_IOC_SWAPEXT: {
824 error = xfs_swapext((struct xfs_swapext __user *)arg);
825 return -error;
826 }
827
828 case XFS_IOC_FSCOUNTS: {
829 xfs_fsop_counts_t out;
830
831 error = xfs_fs_counts(mp, &out);
832 if (error)
833 return -error;
834
835 if (copy_to_user(arg, &out, sizeof(out)))
836 return -XFS_ERROR(EFAULT);
837 return 0;
838 }
839
840 case XFS_IOC_SET_RESBLKS: {
841 xfs_fsop_resblks_t inout;
842 __uint64_t in;
843
844 if (!capable(CAP_SYS_ADMIN))
845 return -EPERM;
846
847 if (copy_from_user(&inout, arg, sizeof(inout)))
848 return -XFS_ERROR(EFAULT);
849
850 /* input parameter is passed in resblks field of structure */
851 in = inout.resblks;
852 error = xfs_reserve_blocks(mp, &in, &inout);
853 if (error)
854 return -error;
855
856 if (copy_to_user(arg, &inout, sizeof(inout)))
857 return -XFS_ERROR(EFAULT);
858 return 0;
859 }
860
861 case XFS_IOC_GET_RESBLKS: {
862 xfs_fsop_resblks_t out;
863
864 if (!capable(CAP_SYS_ADMIN))
865 return -EPERM;
866
867 error = xfs_reserve_blocks(mp, NULL, &out);
868 if (error)
869 return -error;
870
871 if (copy_to_user(arg, &out, sizeof(out)))
872 return -XFS_ERROR(EFAULT);
873
874 return 0;
875 }
876
877 case XFS_IOC_FSGROWFSDATA: {
878 xfs_growfs_data_t in;
879
880 if (!capable(CAP_SYS_ADMIN))
881 return -EPERM;
882
883 if (copy_from_user(&in, arg, sizeof(in)))
884 return -XFS_ERROR(EFAULT);
885
886 error = xfs_growfs_data(mp, &in);
887 return -error;
888 }
889
890 case XFS_IOC_FSGROWFSLOG: {
891 xfs_growfs_log_t in;
892
893 if (!capable(CAP_SYS_ADMIN))
894 return -EPERM;
895
896 if (copy_from_user(&in, arg, sizeof(in)))
897 return -XFS_ERROR(EFAULT);
898
899 error = xfs_growfs_log(mp, &in);
900 return -error;
901 }
902
903 case XFS_IOC_FSGROWFSRT: {
904 xfs_growfs_rt_t in;
905
906 if (!capable(CAP_SYS_ADMIN))
907 return -EPERM;
908
909 if (copy_from_user(&in, arg, sizeof(in)))
910 return -XFS_ERROR(EFAULT);
911
912 error = xfs_growfs_rt(mp, &in);
913 return -error;
914 }
915
916 case XFS_IOC_FREEZE:
917 if (!capable(CAP_SYS_ADMIN))
918 return -EPERM;
919
920 if (inode->i_sb->s_frozen == SB_UNFROZEN)
921 freeze_bdev(inode->i_sb->s_bdev);
922 return 0;
923
924 case XFS_IOC_THAW:
925 if (!capable(CAP_SYS_ADMIN))
926 return -EPERM;
927 if (inode->i_sb->s_frozen != SB_UNFROZEN)
928 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
929 return 0;
930
931 case XFS_IOC_GOINGDOWN: {
932 __uint32_t in;
933
934 if (!capable(CAP_SYS_ADMIN))
935 return -EPERM;
936
937 if (get_user(in, (__uint32_t __user *)arg))
938 return -XFS_ERROR(EFAULT);
939
940 error = xfs_fs_goingdown(mp, in);
941 return -error;
942 }
943
944 case XFS_IOC_ERROR_INJECTION: {
945 xfs_error_injection_t in;
946
947 if (!capable(CAP_SYS_ADMIN))
948 return -EPERM;
949
950 if (copy_from_user(&in, arg, sizeof(in)))
951 return -XFS_ERROR(EFAULT);
952
953 error = xfs_errortag_add(in.errtag, mp);
954 return -error;
955 }
956
957 case XFS_IOC_ERROR_CLEARALL:
958 if (!capable(CAP_SYS_ADMIN))
959 return -EPERM;
960
961 error = xfs_errortag_clearall(mp);
962 return -error;
963
964 default:
965 return -ENOTTY;
966 }
967}
968
969STATIC int
970xfs_ioc_space(
971 bhv_desc_t *bdp,
972 vnode_t *vp,
973 struct file *filp,
974 int ioflags,
975 unsigned int cmd,
976 void __user *arg)
977{
978 xfs_flock64_t bf;
979 int attr_flags = 0;
980 int error;
981
982 if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
983 return -XFS_ERROR(EPERM);
984
985	if (!(filp->f_mode & FMODE_WRITE))
986 return -XFS_ERROR(EBADF);
987
988 if (vp->v_type != VREG)
989 return -XFS_ERROR(EINVAL);
990
991 if (copy_from_user(&bf, arg, sizeof(bf)))
992 return -XFS_ERROR(EFAULT);
993
994 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
995 attr_flags |= ATTR_NONBLOCK;
996 if (ioflags & IO_INVIS)
997 attr_flags |= ATTR_DMI;
998
999 error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos,
1000 NULL, attr_flags);
1001 return -error;
1002}
1003
1004STATIC int
1005xfs_ioc_bulkstat(
1006 xfs_mount_t *mp,
1007 unsigned int cmd,
1008 void __user *arg)
1009{
1010 xfs_fsop_bulkreq_t bulkreq;
1011 int count; /* # of records returned */
1012 xfs_ino_t inlast; /* last inode number */
1013 int done;
1014 int error;
1015
1016	/* done = 1 if there are more stats to get and if bulkstat
1017	 * should be called again (unused here, but used in dmapi) */
1018
1019 if (!capable(CAP_SYS_ADMIN))
1020 return -EPERM;
1021
1022 if (XFS_FORCED_SHUTDOWN(mp))
1023 return -XFS_ERROR(EIO);
1024
1025 if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
1026 return -XFS_ERROR(EFAULT);
1027
1028 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
1029 return -XFS_ERROR(EFAULT);
1030
1031 if ((count = bulkreq.icount) <= 0)
1032 return -XFS_ERROR(EINVAL);
1033
1034 if (cmd == XFS_IOC_FSINUMBERS)
1035 error = xfs_inumbers(mp, &inlast, &count,
1036 bulkreq.ubuffer);
1037 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
1038 error = xfs_bulkstat_single(mp, &inlast,
1039 bulkreq.ubuffer, &done);
1040 else { /* XFS_IOC_FSBULKSTAT */
1041 if (count == 1 && inlast != 0) {
1042 inlast++;
1043 error = xfs_bulkstat_single(mp, &inlast,
1044 bulkreq.ubuffer, &done);
1045 } else {
1046 error = xfs_bulkstat(mp, &inlast, &count,
1047 (bulkstat_one_pf)xfs_bulkstat_one, NULL,
1048 sizeof(xfs_bstat_t), bulkreq.ubuffer,
1049 BULKSTAT_FG_QUICK, &done);
1050 }
1051 }
1052
1053 if (error)
1054 return -error;
1055
1056 if (bulkreq.ocount != NULL) {
1057 if (copy_to_user(bulkreq.lastip, &inlast,
1058 sizeof(xfs_ino_t)))
1059 return -XFS_ERROR(EFAULT);
1060
1061 if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
1062 return -XFS_ERROR(EFAULT);
1063 }
1064
1065 return 0;
1066}
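/*
 * A minimal userspace sketch of the bulkstat interface above, assuming
 * the xfs_fsop_bulkreq_t and xfs_bstat_t definitions from xfs_fs.h;
 * "fd" is a hypothetical open descriptor on the filesystem:
 *
 *	__u64 lastino = 0;
 *	__s32 count;
 *	xfs_bstat_t buf[64];
 *	xfs_fsop_bulkreq_t req = {
 *		.lastip  = &lastino,
 *		.icount  = 64,
 *		.ubuffer = buf,
 *		.ocount  = &count,
 *	};
 *
 *	while (ioctl(fd, XFS_IOC_FSBULKSTAT, &req) == 0 && count > 0)
 *		process(buf, count);		(hypothetical consumer)
 */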
1067
1068STATIC int
1069xfs_ioc_fsgeometry_v1(
1070 xfs_mount_t *mp,
1071 void __user *arg)
1072{
1073 xfs_fsop_geom_v1_t fsgeo;
1074 int error;
1075
1076 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
1077 if (error)
1078 return -error;
1079
1080 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
1081 return -XFS_ERROR(EFAULT);
1082 return 0;
1083}
1084
1085STATIC int
1086xfs_ioc_fsgeometry(
1087 xfs_mount_t *mp,
1088 void __user *arg)
1089{
1090 xfs_fsop_geom_t fsgeo;
1091 int error;
1092
1093 error = xfs_fs_geometry(mp, &fsgeo, 4);
1094 if (error)
1095 return -error;
1096
1097 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
1098 return -XFS_ERROR(EFAULT);
1099 return 0;
1100}
1101
1102/*
1103 * Linux extended inode flags interface.
1104 */
1105#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */
1106#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */
1107#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */
1108#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */
1109#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */
1110
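/*
 * These values intentionally match the ext2-style EXT2_*_FL bits, so
 * generic tools such as chattr(1) and lsattr(1) work unmodified on XFS
 * via XFS_IOC_GETXFLAGS/XFS_IOC_SETXFLAGS.
 */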
1111STATIC unsigned int
1112xfs_merge_ioc_xflags(
1113 unsigned int flags,
1114 unsigned int start)
1115{
1116 unsigned int xflags = start;
1117
1118 if (flags & LINUX_XFLAG_IMMUTABLE)
1119 xflags |= XFS_XFLAG_IMMUTABLE;
1120 else
1121 xflags &= ~XFS_XFLAG_IMMUTABLE;
1122 if (flags & LINUX_XFLAG_APPEND)
1123 xflags |= XFS_XFLAG_APPEND;
1124 else
1125 xflags &= ~XFS_XFLAG_APPEND;
1126 if (flags & LINUX_XFLAG_SYNC)
1127 xflags |= XFS_XFLAG_SYNC;
1128 else
1129 xflags &= ~XFS_XFLAG_SYNC;
1130 if (flags & LINUX_XFLAG_NOATIME)
1131 xflags |= XFS_XFLAG_NOATIME;
1132 else
1133 xflags &= ~XFS_XFLAG_NOATIME;
1134 if (flags & LINUX_XFLAG_NODUMP)
1135 xflags |= XFS_XFLAG_NODUMP;
1136 else
1137 xflags &= ~XFS_XFLAG_NODUMP;
1138
1139 return xflags;
1140}
1141
1142STATIC unsigned int
1143xfs_di2lxflags(
1144 __uint16_t di_flags)
1145{
1146 unsigned int flags = 0;
1147
1148 if (di_flags & XFS_DIFLAG_IMMUTABLE)
1149 flags |= LINUX_XFLAG_IMMUTABLE;
1150 if (di_flags & XFS_DIFLAG_APPEND)
1151 flags |= LINUX_XFLAG_APPEND;
1152 if (di_flags & XFS_DIFLAG_SYNC)
1153 flags |= LINUX_XFLAG_SYNC;
1154 if (di_flags & XFS_DIFLAG_NOATIME)
1155 flags |= LINUX_XFLAG_NOATIME;
1156 if (di_flags & XFS_DIFLAG_NODUMP)
1157 flags |= LINUX_XFLAG_NODUMP;
1158 return flags;
1159}
1160
1161STATIC int
1162xfs_ioc_xattr(
1163 vnode_t *vp,
1164 xfs_inode_t *ip,
1165 struct file *filp,
1166 unsigned int cmd,
1167 void __user *arg)
1168{
1169 struct fsxattr fa;
1170 vattr_t va;
1171 int error;
1172 int attr_flags;
1173 unsigned int flags;
1174
1175 switch (cmd) {
1176 case XFS_IOC_FSGETXATTR: {
1177 va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS;
1178 VOP_GETATTR(vp, &va, 0, NULL, error);
1179 if (error)
1180 return -error;
1181
1182 fa.fsx_xflags = va.va_xflags;
1183 fa.fsx_extsize = va.va_extsize;
1184 fa.fsx_nextents = va.va_nextents;
1185
1186 if (copy_to_user(arg, &fa, sizeof(fa)))
1187 return -XFS_ERROR(EFAULT);
1188 return 0;
1189 }
1190
1191 case XFS_IOC_FSSETXATTR: {
1192 if (copy_from_user(&fa, arg, sizeof(fa)))
1193 return -XFS_ERROR(EFAULT);
1194
1195 attr_flags = 0;
1196 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1197 attr_flags |= ATTR_NONBLOCK;
1198
1199 va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE;
1200 va.va_xflags = fa.fsx_xflags;
1201 va.va_extsize = fa.fsx_extsize;
1202
1203 VOP_SETATTR(vp, &va, attr_flags, NULL, error);
1204 if (!error)
1205 vn_revalidate(vp); /* update Linux inode flags */
1206 return -error;
1207 }
1208
1209 case XFS_IOC_FSGETXATTRA: {
1210 va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_ANEXTENTS;
1211 VOP_GETATTR(vp, &va, 0, NULL, error);
1212 if (error)
1213 return -error;
1214
1215 fa.fsx_xflags = va.va_xflags;
1216 fa.fsx_extsize = va.va_extsize;
1217 fa.fsx_nextents = va.va_anextents;
1218
1219 if (copy_to_user(arg, &fa, sizeof(fa)))
1220 return -XFS_ERROR(EFAULT);
1221 return 0;
1222 }
1223
1224 case XFS_IOC_GETXFLAGS: {
1225 flags = xfs_di2lxflags(ip->i_d.di_flags);
1226 if (copy_to_user(arg, &flags, sizeof(flags)))
1227 return -XFS_ERROR(EFAULT);
1228 return 0;
1229 }
1230
1231 case XFS_IOC_SETXFLAGS: {
1232 if (copy_from_user(&flags, arg, sizeof(flags)))
1233 return -XFS_ERROR(EFAULT);
1234
1235		if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND |
1236			      LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP |
1237 LINUX_XFLAG_SYNC))
1238 return -XFS_ERROR(EOPNOTSUPP);
1239
1240 attr_flags = 0;
1241 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1242 attr_flags |= ATTR_NONBLOCK;
1243
1244 va.va_mask = XFS_AT_XFLAGS;
1245 va.va_xflags = xfs_merge_ioc_xflags(flags,
1246 xfs_ip2xflags(ip));
1247
1248 VOP_SETATTR(vp, &va, attr_flags, NULL, error);
1249 if (!error)
1250 vn_revalidate(vp); /* update Linux inode flags */
1251 return -error;
1252 }
1253
1254 case XFS_IOC_GETVERSION: {
1255 flags = LINVFS_GET_IP(vp)->i_generation;
1256 if (copy_to_user(arg, &flags, sizeof(flags)))
1257 return -XFS_ERROR(EFAULT);
1258 return 0;
1259 }
1260
1261 default:
1262 return -ENOTTY;
1263 }
1264}
1265
1266STATIC int
1267xfs_ioc_getbmap(
1268 bhv_desc_t *bdp,
1269 struct file *filp,
1270 int ioflags,
1271 unsigned int cmd,
1272 void __user *arg)
1273{
1274 struct getbmap bm;
1275 int iflags;
1276 int error;
1277
1278 if (copy_from_user(&bm, arg, sizeof(bm)))
1279 return -XFS_ERROR(EFAULT);
1280
1281 if (bm.bmv_count < 2)
1282 return -XFS_ERROR(EINVAL);
1283
1284 iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1285 if (ioflags & IO_INVIS)
1286 iflags |= BMV_IF_NO_DMAPI_READ;
1287
1288 error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags);
1289 if (error)
1290 return -error;
1291
1292 if (copy_to_user(arg, &bm, sizeof(bm)))
1293 return -XFS_ERROR(EFAULT);
1294 return 0;
1295}
1296
1297STATIC int
1298xfs_ioc_getbmapx(
1299 bhv_desc_t *bdp,
1300 void __user *arg)
1301{
1302 struct getbmapx bmx;
1303 struct getbmap bm;
1304 int iflags;
1305 int error;
1306
1307 if (copy_from_user(&bmx, arg, sizeof(bmx)))
1308 return -XFS_ERROR(EFAULT);
1309
1310 if (bmx.bmv_count < 2)
1311 return -XFS_ERROR(EINVAL);
1312
1313 /*
1314 * Map input getbmapx structure to a getbmap
1315 * structure for xfs_getbmap.
1316 */
1317 GETBMAP_CONVERT(bmx, bm);
1318
1319 iflags = bmx.bmv_iflags;
1320
1321 if (iflags & (~BMV_IF_VALID))
1322 return -XFS_ERROR(EINVAL);
1323
1324 iflags |= BMV_IF_EXTENDED;
1325
1326 error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags);
1327 if (error)
1328 return -error;
1329
1330 GETBMAP_CONVERT(bm, bmx);
1331
1332 if (copy_to_user(arg, &bmx, sizeof(bmx)))
1333 return -XFS_ERROR(EFAULT);
1334
1335 return 0;
1336}
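/*
 * A minimal userspace sketch of the extent-map query handled above,
 * assuming struct getbmap from xfs_fs.h and a hypothetical open fd;
 * slot 0 is the input/output header, extents land in the slots after it:
 *
 *	struct getbmap bmv[16];
 *
 *	memset(bmv, 0, sizeof(bmv));
 *	bmv[0].bmv_length = -1;		(map through end of file)
 *	bmv[0].bmv_count  = 16;		(header slot + up to 15 extents)
 *	if (ioctl(fd, XFS_IOC_GETBMAP, bmv) == 0)
 *		process(&bmv[1], bmv[0].bmv_entries);	(hypothetical)
 */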
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
new file mode 100644
index 000000000000..7a12c83184f5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -0,0 +1,163 @@
1/*
2 * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <linux/config.h>
34#include <linux/compat.h>
35#include <linux/init.h>
36#include <linux/ioctl.h>
37#include <linux/ioctl32.h>
38#include <linux/syscalls.h>
39#include <linux/types.h>
40#include <linux/fs.h>
41#include <asm/uaccess.h>
42
43#include "xfs.h"
44#include "xfs_types.h"
45#include "xfs_fs.h"
46#include "xfs_vfs.h"
47#include "xfs_vnode.h"
48#include "xfs_dfrag.h"
49
50#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
51#define BROKEN_X86_ALIGNMENT
52#else
53
54typedef struct xfs_fsop_bulkreq32 {
55 compat_uptr_t lastip; /* last inode # pointer */
56 __s32 icount; /* count of entries in buffer */
57 compat_uptr_t ubuffer; /* user buffer for inode desc. */
58 __s32 ocount; /* output count pointer */
59} xfs_fsop_bulkreq32_t;
60
61static unsigned long
62xfs_ioctl32_bulkstat(unsigned long arg)
63{
64 xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg;
65 xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p));
66 u32 addr;
67
68 if (get_user(addr, &p32->lastip) ||
69 put_user(compat_ptr(addr), &p->lastip) ||
70 copy_in_user(&p->icount, &p32->icount, sizeof(s32)) ||
71 get_user(addr, &p32->ubuffer) ||
72 put_user(compat_ptr(addr), &p->ubuffer) ||
73 get_user(addr, &p32->ocount) ||
74 put_user(compat_ptr(addr), &p->ocount))
75 return -EFAULT;
76
77 return (unsigned long)p;
78}
79#endif
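/*
 * The function above follows the standard compat-ioctl thunk pattern:
 * build a native xfs_fsop_bulkreq_t in user-accessible space via
 * compat_alloc_user_space(), widen each 32-bit compat_uptr_t with
 * compat_ptr(), and hand the native ioctl path the new user address.
 */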
80
81static long
82__xfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
83{
84 int error;
85 struct inode *inode = f->f_dentry->d_inode;
86 vnode_t *vp = LINVFS_GET_VP(inode);
87
88 switch (cmd) {
89 case XFS_IOC_DIOINFO:
90 case XFS_IOC_FSGEOMETRY_V1:
91 case XFS_IOC_FSGEOMETRY:
92 case XFS_IOC_GETVERSION:
93 case XFS_IOC_GETXFLAGS:
94 case XFS_IOC_SETXFLAGS:
95 case XFS_IOC_FSGETXATTR:
96 case XFS_IOC_FSSETXATTR:
97 case XFS_IOC_FSGETXATTRA:
98 case XFS_IOC_FSSETDM:
99 case XFS_IOC_GETBMAP:
100 case XFS_IOC_GETBMAPA:
101 case XFS_IOC_GETBMAPX:
102/* not handled
103 case XFS_IOC_FD_TO_HANDLE:
104 case XFS_IOC_PATH_TO_HANDLE:
106 case XFS_IOC_PATH_TO_FSHANDLE:
107 case XFS_IOC_OPEN_BY_HANDLE:
108 case XFS_IOC_FSSETDM_BY_HANDLE:
109 case XFS_IOC_READLINK_BY_HANDLE:
110 case XFS_IOC_ATTRLIST_BY_HANDLE:
111 case XFS_IOC_ATTRMULTI_BY_HANDLE:
112*/
113 case XFS_IOC_FSCOUNTS:
114 case XFS_IOC_SET_RESBLKS:
115 case XFS_IOC_GET_RESBLKS:
116 case XFS_IOC_FSGROWFSDATA:
117 case XFS_IOC_FSGROWFSLOG:
118 case XFS_IOC_FSGROWFSRT:
119 case XFS_IOC_FREEZE:
120 case XFS_IOC_THAW:
121 case XFS_IOC_GOINGDOWN:
122 case XFS_IOC_ERROR_INJECTION:
123 case XFS_IOC_ERROR_CLEARALL:
124 break;
125
126#ifndef BROKEN_X86_ALIGNMENT
127 /* xfs_flock_t and xfs_bstat_t have wrong u32 vs u64 alignment */
128 case XFS_IOC_ALLOCSP:
129 case XFS_IOC_FREESP:
130 case XFS_IOC_RESVSP:
131 case XFS_IOC_UNRESVSP:
132 case XFS_IOC_ALLOCSP64:
133 case XFS_IOC_FREESP64:
134 case XFS_IOC_RESVSP64:
135 case XFS_IOC_UNRESVSP64:
136 case XFS_IOC_SWAPEXT:
137 break;
138
139 case XFS_IOC_FSBULKSTAT_SINGLE:
140 case XFS_IOC_FSBULKSTAT:
141 case XFS_IOC_FSINUMBERS:
142 arg = xfs_ioctl32_bulkstat(arg);
143 break;
144#endif
145 default:
146 return -ENOIOCTLCMD;
147 }
148
149 VOP_IOCTL(vp, inode, f, mode, cmd, (void __user *)arg, error);
150 VMODIFY(vp);
151
152 return error;
153}
154
155long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg)
156{
157 return __xfs_compat_ioctl(0, f, cmd, arg);
158}
159
160long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg)
161{
162 return __xfs_compat_ioctl(IO_INVIS, f, cmd, arg);
163}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
new file mode 100644
index 000000000000..779f69a48116
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg);
34long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
new file mode 100644
index 000000000000..407e99359391
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -0,0 +1,680 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_utils.h"
68
69#include <linux/xattr.h>
70#include <linux/namei.h>
71
72
73/*
74 * Pull the link count and size up from the xfs inode to the linux inode
75 */
76STATIC void
77validate_fields(
78 struct inode *ip)
79{
80 vnode_t *vp = LINVFS_GET_VP(ip);
81 vattr_t va;
82 int error;
83
84 va.va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
85 VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error);
86 if (likely(!error)) {
87 ip->i_nlink = va.va_nlink;
88 ip->i_blocks = va.va_nblocks;
89
90 /* we're under i_sem so i_size can't change under us */
91 if (i_size_read(ip) != va.va_size)
92 i_size_write(ip, va.va_size);
93 }
94}
95
96/*
97 * Determine whether a process has a valid fs_struct (kernel daemons
98 * like knfsd don't have an fs_struct).
99 *
100 * XXX(hch): nfsd is broken, better fix it instead.
101 */
102STATIC inline int
103has_fs_struct(struct task_struct *task)
104{
105 return (task->fs != init_task.fs);
106}
107
108STATIC int
109linvfs_mknod(
110 struct inode *dir,
111 struct dentry *dentry,
112 int mode,
113 dev_t rdev)
114{
115 struct inode *ip;
116 vattr_t va;
117 vnode_t *vp = NULL, *dvp = LINVFS_GET_VP(dir);
118 xfs_acl_t *default_acl = NULL;
119 attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS;
120 int error;
121
122 /*
123 * Irix uses Missed'em'V split, but doesn't want to see
124 * the upper 5 bits of (14bit) major.
125 */
126 if (!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)
127 return -EINVAL;
128
129 if (test_default_acl && test_default_acl(dvp)) {
130 if (!_ACL_ALLOC(default_acl))
131 return -ENOMEM;
132 if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
133 _ACL_FREE(default_acl);
134 default_acl = NULL;
135 }
136 }
137
138 if (IS_POSIXACL(dir) && !default_acl && has_fs_struct(current))
139 mode &= ~current->fs->umask;
140
141 memset(&va, 0, sizeof(va));
142 va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
143 va.va_type = IFTOVT(mode);
144 va.va_mode = mode;
145
146 switch (mode & S_IFMT) {
147 case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
148 va.va_rdev = sysv_encode_dev(rdev);
149 va.va_mask |= XFS_AT_RDEV;
150 /*FALLTHROUGH*/
151 case S_IFREG:
152 VOP_CREATE(dvp, dentry, &va, &vp, NULL, error);
153 break;
154 case S_IFDIR:
155 VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error);
156 break;
157 default:
158 error = EINVAL;
159 break;
160 }
161
162 if (default_acl) {
163 if (!error) {
164 error = _ACL_INHERIT(vp, &va, default_acl);
165 if (!error) {
166 VMODIFY(vp);
167 } else {
168 struct dentry teardown = {};
169 int err2;
170
171 /* Oh, the horror.
172 * If we can't add the ACL we must back out.
173 * ENOSPC can hit here, among other things.
174 */
175 teardown.d_inode = ip = LINVFS_GET_IP(vp);
176 teardown.d_name = dentry->d_name;
177
178 vn_mark_bad(vp);
179
180 if (S_ISDIR(mode))
181 VOP_RMDIR(dvp, &teardown, NULL, err2);
182 else
183 VOP_REMOVE(dvp, &teardown, NULL, err2);
184 VN_RELE(vp);
185 }
186 }
187 _ACL_FREE(default_acl);
188 }
189
190 if (!error) {
191 ASSERT(vp);
192 ip = LINVFS_GET_IP(vp);
193
194 if (S_ISCHR(mode) || S_ISBLK(mode))
195 ip->i_rdev = rdev;
196 else if (S_ISDIR(mode))
197 validate_fields(ip);
198 d_instantiate(dentry, ip);
199 validate_fields(dir);
200 }
201 return -error;
202}
203
204STATIC int
205linvfs_create(
206 struct inode *dir,
207 struct dentry *dentry,
208 int mode,
209 struct nameidata *nd)
210{
211 return linvfs_mknod(dir, dentry, mode, 0);
212}
213
214STATIC int
215linvfs_mkdir(
216 struct inode *dir,
217 struct dentry *dentry,
218 int mode)
219{
220 return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0);
221}
222
223STATIC struct dentry *
224linvfs_lookup(
225 struct inode *dir,
226 struct dentry *dentry,
227 struct nameidata *nd)
228{
229 struct vnode *vp = LINVFS_GET_VP(dir), *cvp;
230 int error;
231
232 if (dentry->d_name.len >= MAXNAMELEN)
233 return ERR_PTR(-ENAMETOOLONG);
234
235 VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
236 if (error) {
237 if (unlikely(error != ENOENT))
238 return ERR_PTR(-error);
239 d_add(dentry, NULL);
240 return NULL;
241 }
242
243 return d_splice_alias(LINVFS_GET_IP(cvp), dentry);
244}
245
246STATIC int
247linvfs_link(
248 struct dentry *old_dentry,
249 struct inode *dir,
250 struct dentry *dentry)
251{
252 struct inode *ip; /* inode of guy being linked to */
253 vnode_t *tdvp; /* target directory for new name/link */
254 vnode_t *vp; /* vp of name being linked */
255 int error;
256
257 ip = old_dentry->d_inode; /* inode being linked to */
258 if (S_ISDIR(ip->i_mode))
259 return -EPERM;
260
261 tdvp = LINVFS_GET_VP(dir);
262 vp = LINVFS_GET_VP(ip);
263
264 VOP_LINK(tdvp, vp, dentry, NULL, error);
265 if (!error) {
266 VMODIFY(tdvp);
267 VN_HOLD(vp);
268 validate_fields(ip);
269 d_instantiate(dentry, ip);
270 }
271 return -error;
272}
273
274STATIC int
275linvfs_unlink(
276 struct inode *dir,
277 struct dentry *dentry)
278{
279 struct inode *inode;
280 vnode_t *dvp; /* directory containing name to remove */
281 int error;
282
283 inode = dentry->d_inode;
284 dvp = LINVFS_GET_VP(dir);
285
286 VOP_REMOVE(dvp, dentry, NULL, error);
287 if (!error) {
288 validate_fields(dir); /* For size only */
289 validate_fields(inode);
290 }
291
292 return -error;
293}
294
295STATIC int
296linvfs_symlink(
297 struct inode *dir,
298 struct dentry *dentry,
299 const char *symname)
300{
301 struct inode *ip;
302 vattr_t va;
303 vnode_t *dvp; /* directory containing name of symlink */
304 vnode_t *cvp; /* used to lookup symlink to put in dentry */
305 int error;
306
307 dvp = LINVFS_GET_VP(dir);
308 cvp = NULL;
309
310 memset(&va, 0, sizeof(va));
311 va.va_type = VLNK;
312 va.va_mode = irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO;
313 va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
314
315 error = 0;
316 VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error);
317 if (!error && cvp) {
318 ASSERT(cvp->v_type == VLNK);
319 ip = LINVFS_GET_IP(cvp);
320 d_instantiate(dentry, ip);
321 validate_fields(dir);
322 validate_fields(ip); /* size needs update */
323 }
324 return -error;
325}
326
327STATIC int
328linvfs_rmdir(
329 struct inode *dir,
330 struct dentry *dentry)
331{
332 struct inode *inode = dentry->d_inode;
333 vnode_t *dvp = LINVFS_GET_VP(dir);
334 int error;
335
336 VOP_RMDIR(dvp, dentry, NULL, error);
337 if (!error) {
338 validate_fields(inode);
339 validate_fields(dir);
340 }
341 return -error;
342}
343
344STATIC int
345linvfs_rename(
346 struct inode *odir,
347 struct dentry *odentry,
348 struct inode *ndir,
349 struct dentry *ndentry)
350{
351 struct inode *new_inode = ndentry->d_inode;
352 vnode_t *fvp; /* from directory */
353 vnode_t *tvp; /* target directory */
354 int error;
355
356 fvp = LINVFS_GET_VP(odir);
357 tvp = LINVFS_GET_VP(ndir);
358
359 VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
360 if (error)
361 return -error;
362
363 if (new_inode)
364 validate_fields(new_inode);
365
366 validate_fields(odir);
367 if (ndir != odir)
368 validate_fields(ndir);
369 return 0;
370}
371
372/*
 373 * Careful here - this function can get called recursively, so
 374 * we need to keep stack usage to a minimum; the uio is kmalloced
 375 * rather than stack-allocated for this reason...
376 */
377STATIC int
378linvfs_follow_link(
379 struct dentry *dentry,
380 struct nameidata *nd)
381{
382 vnode_t *vp;
383 uio_t *uio;
384 iovec_t iov;
385 int error;
386 char *link;
387
388 ASSERT(dentry);
389 ASSERT(nd);
390
391 link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL);
392 if (!link) {
393 nd_set_link(nd, ERR_PTR(-ENOMEM));
394 return 0;
395 }
396
397 uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
398 if (!uio) {
399 kfree(link);
400 nd_set_link(nd, ERR_PTR(-ENOMEM));
401 return 0;
402 }
403
404 vp = LINVFS_GET_VP(dentry->d_inode);
405
406 iov.iov_base = link;
407 iov.iov_len = MAXNAMELEN;
408
409 uio->uio_iov = &iov;
410 uio->uio_offset = 0;
411 uio->uio_segflg = UIO_SYSSPACE;
412 uio->uio_resid = MAXNAMELEN;
413 uio->uio_iovcnt = 1;
414
415 VOP_READLINK(vp, uio, 0, NULL, error);
416 if (error) {
417 kfree(link);
418 link = ERR_PTR(-error);
419 } else {
420 link[MAXNAMELEN - uio->uio_resid] = '\0';
421 }
422 kfree(uio);
423
424 nd_set_link(nd, link);
425 return 0;
426}
427
428static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd)
429{
430 char *s = nd_get_link(nd);
431 if (!IS_ERR(s))
432 kfree(s);
433}
434
435#ifdef CONFIG_XFS_POSIX_ACL
436STATIC int
437linvfs_permission(
438 struct inode *inode,
439 int mode,
440 struct nameidata *nd)
441{
442 vnode_t *vp = LINVFS_GET_VP(inode);
443 int error;
444
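	/*
	 * The Linux MAY_* mask lives in the low three bits
	 * (MAY_EXEC = 1, MAY_WRITE = 2, MAY_READ = 4); shifting it
	 * left by six moves it into the owner-class rwx position,
	 * e.g. MAY_READ becomes S_IRUSR (0400).
	 */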
445 mode <<= 6; /* convert from linux to vnode access bits */
446 VOP_ACCESS(vp, mode, NULL, error);
447 return -error;
448}
449#else
450#define linvfs_permission NULL
451#endif
452
453STATIC int
454linvfs_getattr(
455 struct vfsmount *mnt,
456 struct dentry *dentry,
457 struct kstat *stat)
458{
459 struct inode *inode = dentry->d_inode;
460 vnode_t *vp = LINVFS_GET_VP(inode);
461 int error = 0;
462
463 if (unlikely(vp->v_flag & VMODIFIED))
464 error = vn_revalidate(vp);
465 if (!error)
466 generic_fillattr(inode, stat);
 467	return error;
468}
469
470STATIC int
471linvfs_setattr(
472 struct dentry *dentry,
473 struct iattr *attr)
474{
475 struct inode *inode = dentry->d_inode;
476 unsigned int ia_valid = attr->ia_valid;
477 vnode_t *vp = LINVFS_GET_VP(inode);
478 vattr_t vattr;
479 int flags = 0;
480 int error;
481
482 memset(&vattr, 0, sizeof(vattr_t));
483 if (ia_valid & ATTR_UID) {
484 vattr.va_mask |= XFS_AT_UID;
485 vattr.va_uid = attr->ia_uid;
486 }
487 if (ia_valid & ATTR_GID) {
488 vattr.va_mask |= XFS_AT_GID;
489 vattr.va_gid = attr->ia_gid;
490 }
491 if (ia_valid & ATTR_SIZE) {
492 vattr.va_mask |= XFS_AT_SIZE;
493 vattr.va_size = attr->ia_size;
494 }
495 if (ia_valid & ATTR_ATIME) {
496 vattr.va_mask |= XFS_AT_ATIME;
497 vattr.va_atime = attr->ia_atime;
498 }
499 if (ia_valid & ATTR_MTIME) {
500 vattr.va_mask |= XFS_AT_MTIME;
501 vattr.va_mtime = attr->ia_mtime;
502 }
503 if (ia_valid & ATTR_CTIME) {
504 vattr.va_mask |= XFS_AT_CTIME;
505 vattr.va_ctime = attr->ia_ctime;
506 }
507 if (ia_valid & ATTR_MODE) {
508 vattr.va_mask |= XFS_AT_MODE;
509 vattr.va_mode = attr->ia_mode;
510 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
511 inode->i_mode &= ~S_ISGID;
512 }
513
514 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
515 flags |= ATTR_UTIME;
516#ifdef ATTR_NO_BLOCK
517 if ((ia_valid & ATTR_NO_BLOCK))
518 flags |= ATTR_NONBLOCK;
519#endif
520
521 VOP_SETATTR(vp, &vattr, flags, NULL, error);
522 if (error)
523 return -error;
524 vn_revalidate(vp);
525 return error;
526}
527
528STATIC void
529linvfs_truncate(
530 struct inode *inode)
531{
532 block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block);
533}
534
535STATIC int
536linvfs_setxattr(
537 struct dentry *dentry,
538 const char *name,
539 const void *data,
540 size_t size,
541 int flags)
542{
543 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
544 char *attr = (char *)name;
545 attrnames_t *namesp;
546 int xflags = 0;
547 int error;
548
549 namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
550 if (!namesp)
551 return -EOPNOTSUPP;
552 attr += namesp->attr_namelen;
553 error = namesp->attr_capable(vp, NULL);
554 if (error)
555 return error;
556
557 /* Convert Linux syscall to XFS internal ATTR flags */
558 if (flags & XATTR_CREATE)
559 xflags |= ATTR_CREATE;
560 if (flags & XATTR_REPLACE)
561 xflags |= ATTR_REPLACE;
562 xflags |= namesp->attr_flag;
563 return namesp->attr_set(vp, attr, (void *)data, size, xflags);
564}
565
566STATIC ssize_t
567linvfs_getxattr(
568 struct dentry *dentry,
569 const char *name,
570 void *data,
571 size_t size)
572{
573 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
574 char *attr = (char *)name;
575 attrnames_t *namesp;
576 int xflags = 0;
577 ssize_t error;
578
579 namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
580 if (!namesp)
581 return -EOPNOTSUPP;
582 attr += namesp->attr_namelen;
583 error = namesp->attr_capable(vp, NULL);
584 if (error)
585 return error;
586
 587	/* A zero size means the caller only wants the value's length */
588 if (!size) {
589 xflags |= ATTR_KERNOVAL;
590 data = NULL;
591 }
592 xflags |= namesp->attr_flag;
593 return namesp->attr_get(vp, attr, (void *)data, size, xflags);
594}
595
596STATIC ssize_t
597linvfs_listxattr(
598 struct dentry *dentry,
599 char *data,
600 size_t size)
601{
602 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
603 int error, xflags = ATTR_KERNAMELS;
604 ssize_t result;
605
606 if (!size)
607 xflags |= ATTR_KERNOVAL;
608 xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS;
609
610 error = attr_generic_list(vp, data, size, xflags, &result);
611 if (error < 0)
612 return error;
613 return result;
614}
615
616STATIC int
617linvfs_removexattr(
618 struct dentry *dentry,
619 const char *name)
620{
621 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
622 char *attr = (char *)name;
623 attrnames_t *namesp;
624 int xflags = 0;
625 int error;
626
627 namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
628 if (!namesp)
629 return -EOPNOTSUPP;
630 attr += namesp->attr_namelen;
631 error = namesp->attr_capable(vp, NULL);
632 if (error)
633 return error;
634 xflags |= namesp->attr_flag;
635 return namesp->attr_remove(vp, attr, xflags);
636}
637
638
639struct inode_operations linvfs_file_inode_operations = {
640 .permission = linvfs_permission,
641 .truncate = linvfs_truncate,
642 .getattr = linvfs_getattr,
643 .setattr = linvfs_setattr,
644 .setxattr = linvfs_setxattr,
645 .getxattr = linvfs_getxattr,
646 .listxattr = linvfs_listxattr,
647 .removexattr = linvfs_removexattr,
648};
649
650struct inode_operations linvfs_dir_inode_operations = {
651 .create = linvfs_create,
652 .lookup = linvfs_lookup,
653 .link = linvfs_link,
654 .unlink = linvfs_unlink,
655 .symlink = linvfs_symlink,
656 .mkdir = linvfs_mkdir,
657 .rmdir = linvfs_rmdir,
658 .mknod = linvfs_mknod,
659 .rename = linvfs_rename,
660 .permission = linvfs_permission,
661 .getattr = linvfs_getattr,
662 .setattr = linvfs_setattr,
663 .setxattr = linvfs_setxattr,
664 .getxattr = linvfs_getxattr,
665 .listxattr = linvfs_listxattr,
666 .removexattr = linvfs_removexattr,
667};
668
669struct inode_operations linvfs_symlink_inode_operations = {
670 .readlink = generic_readlink,
671 .follow_link = linvfs_follow_link,
672 .put_link = linvfs_put_link,
673 .permission = linvfs_permission,
674 .getattr = linvfs_getattr,
675 .setattr = linvfs_setattr,
676 .setxattr = linvfs_setxattr,
677 .getxattr = linvfs_getxattr,
678 .listxattr = linvfs_listxattr,
679 .removexattr = linvfs_removexattr,
680};
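A recurring pattern in the entry points above is worth calling out: the VOP_* behaviour macros hand back a positive XFS errno through their final argument, and each Linux-facing wrapper negates it on return. A minimal sketch of that convention, using a hypothetical stand-in for the behaviour call (not a real VOP_* macro):

/* Sketch only -- xfs_vop_frob() is a made-up stand-in for a
 * VOP_* behaviour call; it returns a positive XFS errno or 0. */
extern int xfs_vop_frob(struct inode *inode);

STATIC int
linvfs_example_op(
	struct inode	*inode)
{
	int		error;

	error = xfs_vop_frob(inode);	/* positive errno, 0 on success */
	return -error;			/* Linux callers expect negative errnos */
}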
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
new file mode 100644
index 000000000000..6a69a62c36b0
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IOPS_H__
33#define __XFS_IOPS_H__
34
35extern struct inode_operations linvfs_file_inode_operations;
36extern struct inode_operations linvfs_dir_inode_operations;
37extern struct inode_operations linvfs_symlink_inode_operations;
38
39extern struct file_operations linvfs_file_operations;
40extern struct file_operations linvfs_invis_file_operations;
41extern struct file_operations linvfs_dir_operations;
42
43extern struct address_space_operations linvfs_aops;
44
45extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
46extern void linvfs_unwritten_done(struct buffer_head *, int);
47
48extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *,
49 int, unsigned int, void __user *);
50
51#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
new file mode 100644
index 000000000000..71bb41019a12
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -0,0 +1,374 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LINUX__
33#define __XFS_LINUX__
34
35#include <linux/types.h>
36#include <linux/config.h>
37
38/*
39 * Some types are conditional depending on the target system.
40 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
41 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well
42 * as requiring XFS_BIG_BLKNOS to be set.
43 */
44#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
45# define XFS_BIG_BLKNOS 1
46# if BITS_PER_LONG == 64
47# define XFS_BIG_INUMS 1
48# else
49# define XFS_BIG_INUMS 0
50# endif
51#else
52# define XFS_BIG_BLKNOS 0
53# define XFS_BIG_INUMS 0
54#endif
55
56#include <xfs_types.h>
57#include <xfs_arch.h>
58
59#include <kmem.h>
60#include <mrlock.h>
61#include <spin.h>
62#include <sv.h>
63#include <mutex.h>
64#include <sema.h>
65#include <time.h>
66
67#include <support/qsort.h>
68#include <support/ktrace.h>
69#include <support/debug.h>
70#include <support/move.h>
71#include <support/uuid.h>
72
73#include <linux/mm.h>
74#include <linux/kernel.h>
75#include <linux/blkdev.h>
76#include <linux/slab.h>
77#include <linux/module.h>
78#include <linux/file.h>
79#include <linux/swap.h>
80#include <linux/errno.h>
81#include <linux/sched.h>
82#include <linux/bitops.h>
83#include <linux/major.h>
84#include <linux/pagemap.h>
85#include <linux/vfs.h>
86#include <linux/seq_file.h>
87#include <linux/init.h>
88#include <linux/list.h>
89#include <linux/proc_fs.h>
90#include <linux/version.h>
91#include <linux/sort.h>
92
93#include <asm/page.h>
94#include <asm/div64.h>
95#include <asm/param.h>
96#include <asm/uaccess.h>
97#include <asm/byteorder.h>
98#include <asm/unaligned.h>
99
100#include <xfs_behavior.h>
101#include <xfs_vfs.h>
102#include <xfs_cred.h>
103#include <xfs_vnode.h>
104#include <xfs_stats.h>
105#include <xfs_sysctl.h>
106#include <xfs_iops.h>
107#include <xfs_super.h>
108#include <xfs_globals.h>
109#include <xfs_fs_subr.h>
110#include <xfs_lrw.h>
111#include <xfs_buf.h>
112
113/*
114 * Feature macros (disable/enable)
115 */
116#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
117#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */
118
119/*
120 * State flag for unwritten extent buffers.
121 *
122 * We need to be able to distinguish between these and delayed
123 * allocate buffers within XFS. The generic IO path code does
124 * not need to distinguish - we use the BH_Delay flag for both
125 * delalloc and these ondisk-uninitialised buffers.
126 */
127BUFFER_FNS(PrivateStart, unwritten);
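/*
 * BUFFER_FNS(PrivateStart, unwritten) generates the buffer_unwritten(),
 * set_buffer_unwritten() and clear_buffer_unwritten() accessors on top
 * of the BH_PrivateStart state bit.
 */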
128static inline void set_buffer_unwritten_io(struct buffer_head *bh)
129{
130 bh->b_end_io = linvfs_unwritten_done;
131}
132
133#define restricted_chown xfs_params.restrict_chown.val
134#define irix_sgid_inherit xfs_params.sgid_inherit.val
135#define irix_symlink_mode xfs_params.symlink_mode.val
136#define xfs_panic_mask xfs_params.panic_mask.val
137#define xfs_error_level xfs_params.error_level.val
138#define xfs_syncd_centisecs xfs_params.syncd_timer.val
139#define xfs_stats_clear xfs_params.stats_clear.val
140#define xfs_inherit_sync xfs_params.inherit_sync.val
141#define xfs_inherit_nodump xfs_params.inherit_nodump.val
142#define xfs_inherit_noatime xfs_params.inherit_noatim.val
143#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
144#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
145#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
146#define xfs_rotorstep xfs_params.rotorstep.val
147
148#ifndef __smp_processor_id
149#define __smp_processor_id() smp_processor_id()
150#endif
151#define current_cpu() __smp_processor_id()
152#define current_pid() (current->pid)
153#define current_fsuid(cred) (current->fsuid)
154#define current_fsgid(cred) (current->fsgid)
155
156#define NBPP PAGE_SIZE
157#define DPPSHFT (PAGE_SHIFT - 9)
158#define NDPP (1 << (PAGE_SHIFT - 9))
159#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT)
160#define dtopt(DD) ((DD) >> DPPSHFT)
161#define dpoff(DD) ((DD) & (NDPP-1))
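/* e.g. with 4k pages: NDPP == 8, dtop(9) == 2, dtopt(9) == 1, dpoff(9) == 1 */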
162
163#define NBBY 8 /* number of bits per byte */
164#define NBPC PAGE_SIZE /* Number of bytes per click */
165#define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */
166
167/*
168 * Size of block device i/o is parameterized here.
169 * Currently the system supports page-sized i/o.
170 */
171#define BLKDEV_IOSHIFT BPCSHIFT
172#define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
173/* number of BB's per block device block */
174#define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
175
176/* bytes to clicks */
177#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
178#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
179#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
180#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT)
181#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
182#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT)
183
184/* off_t bytes to clicks */
185#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
186#define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT)
187
188/* clicks to off_t bytes */
189#define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT)
190
191/* clicks to bytes */
192#define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT)
194#define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT)
195#define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT)
196
199
200#ifndef CELL_CAPABLE
201#define FSC_NOTIFY_NAME_CHANGED(vp)
202#endif
203
204#ifndef ENOATTR
205#define ENOATTR ENODATA /* Attribute not found */
206#endif
207
208/* Note: EWRONGFS never visible outside the kernel */
209#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
210
211/*
212 * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
213 * return codes out of its known range in errno.
214 * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
215 * conflict with any code we use already or any code a driver may use)
216 * XXX Some options (currently we do #2):
217 * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
218 * 2/ 990 ["Unknown error 990"]
219 * 3/ EUCLEAN ["Structure needs cleaning"]
220 * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
221 */
222#define EFSCORRUPTED 990 /* Filesystem is corrupted */
223
224#define SYNCHRONIZE() barrier()
225#define __return_address __builtin_return_address(0)
226
227/*
228 * IRIX (BSD) quotactl makes use of separate commands for user/group,
229 * whereas on Linux the syscall encodes this information into the cmd
230 * field (see the QCMD macro in quota.h). These macros help keep the
231 * code portable - they are not visible from the syscall interface.
232 */
233#define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */
234#define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */
235
236/* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */
237/* we may well need to fine-tune this if it ever becomes an issue. */
238#define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */
239#define ndquot DQUOT_MAX_HEURISTIC
240
241/* IRIX uses the current size of the name cache to guess a good value */
242/* - this isn't the same but is a good enough starting point for now. */
243#define DQUOT_HASH_HEURISTIC files_stat.nr_files
244
245/* IRIX inodes maintain the project ID also, zero this field on Linux */
246#define DEFAULT_PROJID 0
247#define dfltprid DEFAULT_PROJID
248
249#define MAXPATHLEN 1024
250
251#define MIN(a,b) (min(a,b))
252#define MAX(a,b) (max(a,b))
253#define howmany(x, y) (((x)+((y)-1))/(y))
254#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
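/* e.g. howmany(10, 4) == 3 and roundup(10, 4) == 12 */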
255
256#define xfs_stack_trace() dump_stack()
257
258#define xfs_itruncate_data(ip, off) \
259 (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off)))
260
261
262/* Move the kernel do_div definition off to one side */
263
264#if defined __i386__
265/* For ia32 we need to pull some tricks to get past various versions
266 * of the compiler which do not like us using do_div in the middle
267 * of large functions.
268 */
269static inline __u32 xfs_do_div(void *a, __u32 b, int n)
270{
271 __u32 mod;
272
273 switch (n) {
274 case 4:
275 mod = *(__u32 *)a % b;
276 *(__u32 *)a = *(__u32 *)a / b;
277 return mod;
278 case 8:
279 {
280 unsigned long __upper, __low, __high, __mod;
281 __u64 c = *(__u64 *)a;
282 __upper = __high = c >> 32;
283 __low = c;
284 if (__high) {
285 __upper = __high % (b);
286 __high = __high / (b);
287 }
288 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
289 asm("":"=A" (c):"a" (__low),"d" (__high));
290 *(__u64 *)a = c;
291 return __mod;
292 }
293 }
294
295 /* NOTREACHED */
296 return 0;
297}
298
299/* Side effect free 64 bit mod operation */
300static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
301{
302 switch (n) {
303 case 4:
304 return *(__u32 *)a % b;
305 case 8:
306 {
307 unsigned long __upper, __low, __high, __mod;
308 __u64 c = *(__u64 *)a;
309 __upper = __high = c >> 32;
310 __low = c;
311 if (__high) {
312 __upper = __high % (b);
313 __high = __high / (b);
314 }
315 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
316 asm("":"=A" (c):"a" (__low),"d" (__high));
317 return __mod;
318 }
319 }
320
321 /* NOTREACHED */
322 return 0;
323}
324#else
325static inline __u32 xfs_do_div(void *a, __u32 b, int n)
326{
327 __u32 mod;
328
329 switch (n) {
330 case 4:
331 mod = *(__u32 *)a % b;
332 *(__u32 *)a = *(__u32 *)a / b;
333 return mod;
334 case 8:
335 mod = do_div(*(__u64 *)a, b);
336 return mod;
337 }
338
339 /* NOTREACHED */
340 return 0;
341}
342
343/* Side effect free 64 bit mod operation */
344static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
345{
346 switch (n) {
347 case 4:
348 return *(__u32 *)a % b;
349 case 8:
350 {
351 __u64 c = *(__u64 *)a;
352 return do_div(c, b);
353 }
354 }
355
356 /* NOTREACHED */
357 return 0;
358}
359#endif
360
361#undef do_div
362#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
363#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
364
365static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
366{
367 x += y - 1;
368 do_div(x, y);
369 return(x * y);
370}
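/*
 * Illustrative usage (not from the original source): do_div() divides
 * its dividend in place and returns the remainder, while do_mod() is
 * side-effect free.  With a and off declared __uint64_t:
 *
 *	a = 4100;   do_div(a, 512);	now a == 8, remainder 4 returned
 *	off = 4100; do_mod(off, 512);	returns 4, off is still 4100
 *	roundup_64(4100, 512) == 4608
 */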
371
372#define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL)
373
374#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
new file mode 100644
index 000000000000..ff145fd0d1a4
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -0,0 +1,1082 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32/*
33 * fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
34 *
35 */
36
37#include "xfs.h"
38
39#include "xfs_fs.h"
40#include "xfs_inum.h"
41#include "xfs_log.h"
42#include "xfs_trans.h"
43#include "xfs_sb.h"
44#include "xfs_ag.h"
45#include "xfs_dir.h"
46#include "xfs_dir2.h"
47#include "xfs_alloc.h"
48#include "xfs_dmapi.h"
49#include "xfs_quota.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_ialloc_btree.h"
54#include "xfs_btree.h"
55#include "xfs_ialloc.h"
56#include "xfs_attr_sf.h"
57#include "xfs_dir_sf.h"
58#include "xfs_dir2_sf.h"
59#include "xfs_dinode.h"
60#include "xfs_inode.h"
61#include "xfs_bmap.h"
62#include "xfs_bit.h"
63#include "xfs_rtalloc.h"
64#include "xfs_error.h"
65#include "xfs_itable.h"
66#include "xfs_rw.h"
67#include "xfs_acl.h"
68#include "xfs_cap.h"
69#include "xfs_mac.h"
70#include "xfs_attr.h"
71#include "xfs_inode_item.h"
72#include "xfs_buf_item.h"
73#include "xfs_utils.h"
74#include "xfs_iomap.h"
75
76#include <linux/capability.h>
77#include <linux/writeback.h>
78
79
80#if defined(XFS_RW_TRACE)
81void
82xfs_rw_enter_trace(
83 int tag,
84 xfs_iocore_t *io,
85 void *data,
86 size_t segs,
87 loff_t offset,
88 int ioflags)
89{
90 xfs_inode_t *ip = XFS_IO_INODE(io);
91
92 if (ip->i_rwtrace == NULL)
93 return;
94 ktrace_enter(ip->i_rwtrace,
95 (void *)(unsigned long)tag,
96 (void *)ip,
97 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
98 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
99 (void *)data,
100 (void *)((unsigned long)segs),
101 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
102 (void *)((unsigned long)(offset & 0xffffffff)),
103 (void *)((unsigned long)ioflags),
104 (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
105 (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
106 (void *)NULL,
107 (void *)NULL,
108 (void *)NULL,
109 (void *)NULL,
110 (void *)NULL);
111}
112
113void
114xfs_inval_cached_trace(
115 xfs_iocore_t *io,
116 xfs_off_t offset,
117 xfs_off_t len,
118 xfs_off_t first,
119 xfs_off_t last)
120{
121 xfs_inode_t *ip = XFS_IO_INODE(io);
122
123 if (ip->i_rwtrace == NULL)
124 return;
125 ktrace_enter(ip->i_rwtrace,
126 (void *)(__psint_t)XFS_INVAL_CACHED,
127 (void *)ip,
128 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
129 (void *)((unsigned long)(offset & 0xffffffff)),
130 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
131 (void *)((unsigned long)(len & 0xffffffff)),
132 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
133 (void *)((unsigned long)(first & 0xffffffff)),
134 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
135 (void *)((unsigned long)(last & 0xffffffff)),
136 (void *)NULL,
137 (void *)NULL,
138 (void *)NULL,
139 (void *)NULL,
140 (void *)NULL,
141 (void *)NULL);
142}
143#endif
144
145/*
146 * xfs_iozero
147 *
 148 * xfs_iozero clears the specified range of the supplied buffer,
149 * and marks all the affected blocks as valid and modified. If
150 * an affected block is not allocated, it will be allocated. If
151 * an affected block is not completely overwritten, and is not
152 * valid before the operation, it will be read from disk before
153 * being partially zeroed.
154 */
155STATIC int
156xfs_iozero(
157 struct inode *ip, /* inode */
158 loff_t pos, /* offset in file */
159 size_t count, /* size of data to zero */
160 loff_t end_size) /* max file size to set */
161{
162 unsigned bytes;
163 struct page *page;
164 struct address_space *mapping;
165 char *kaddr;
166 int status;
167
168 mapping = ip->i_mapping;
169 do {
170 unsigned long index, offset;
171
172 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
173 index = pos >> PAGE_CACHE_SHIFT;
174 bytes = PAGE_CACHE_SIZE - offset;
175 if (bytes > count)
176 bytes = count;
177
178 status = -ENOMEM;
179 page = grab_cache_page(mapping, index);
180 if (!page)
181 break;
182
183 kaddr = kmap(page);
184 status = mapping->a_ops->prepare_write(NULL, page, offset,
185 offset + bytes);
186 if (status) {
187 goto unlock;
188 }
189
190 memset((void *) (kaddr + offset), 0, bytes);
191 flush_dcache_page(page);
192 status = mapping->a_ops->commit_write(NULL, page, offset,
193 offset + bytes);
194 if (!status) {
195 pos += bytes;
196 count -= bytes;
197 if (pos > i_size_read(ip))
198 i_size_write(ip, pos < end_size ? pos : end_size);
199 }
200
201unlock:
202 kunmap(page);
203 unlock_page(page);
204 page_cache_release(page);
205 if (status)
206 break;
207 } while (count);
208
209 return (-status);
210}
211
212/*
213 * xfs_inval_cached_pages
214 *
215 * This routine is responsible for keeping direct I/O and buffered I/O
216 * somewhat coherent. From here we make sure that we're at least
217 * temporarily holding the inode I/O lock exclusively and then call
218 * the page cache to flush and invalidate any cached pages. If there
219 * are no cached pages this routine will be very quick.
220 */
221void
222xfs_inval_cached_pages(
223 vnode_t *vp,
224 xfs_iocore_t *io,
225 xfs_off_t offset,
226 int write,
227 int relock)
228{
229 if (VN_CACHED(vp)) {
230 xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
231 VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
232 }
233
234}
235
236ssize_t /* bytes read, or (-) error */
237xfs_read(
238 bhv_desc_t *bdp,
239 struct kiocb *iocb,
240 const struct iovec *iovp,
241 unsigned int segs,
242 loff_t *offset,
243 int ioflags,
244 cred_t *credp)
245{
246 struct file *file = iocb->ki_filp;
247 struct inode *inode = file->f_mapping->host;
248 size_t size = 0;
249 ssize_t ret;
250 xfs_fsize_t n;
251 xfs_inode_t *ip;
252 xfs_mount_t *mp;
253 vnode_t *vp;
254 unsigned long seg;
255
256 ip = XFS_BHVTOI(bdp);
257 vp = BHV_TO_VNODE(bdp);
258 mp = ip->i_mount;
259
260 XFS_STATS_INC(xs_read_calls);
261
262 /* START copy & waste from filemap.c */
263 for (seg = 0; seg < segs; seg++) {
264 const struct iovec *iv = &iovp[seg];
265
266 /*
267 * If any segment has a negative length, or the cumulative
268 * length ever wraps negative then return -EINVAL.
269 */
270 size += iv->iov_len;
271 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 272			return -XFS_ERROR(EINVAL);
273 }
274 /* END copy & waste from filemap.c */
275
276 if (unlikely(ioflags & IO_ISDIRECT)) {
277 xfs_buftarg_t *target =
278 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
279 mp->m_rtdev_targp : mp->m_ddev_targp;
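		/*
		 * pbr_smask is the sector-size mask (e.g. 511 for
		 * 512-byte sectors): direct I/O requires both the
		 * offset and the total length to be sector aligned.
		 */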
280 if ((*offset & target->pbr_smask) ||
281 (size & target->pbr_smask)) {
282 if (*offset == ip->i_d.di_size) {
283 return (0);
284 }
285 return -XFS_ERROR(EINVAL);
286 }
287 }
288
289 n = XFS_MAXIOFFSET(mp) - *offset;
290 if ((n <= 0) || (size == 0))
291 return 0;
292
293 if (n < size)
294 size = n;
295
296 if (XFS_FORCED_SHUTDOWN(mp)) {
297 return -EIO;
298 }
299
300 if (unlikely(ioflags & IO_ISDIRECT))
301 down(&inode->i_sem);
302 xfs_ilock(ip, XFS_IOLOCK_SHARED);
303
304 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
305 !(ioflags & IO_INVIS)) {
306 vrwlock_t locktype = VRWLOCK_READ;
307
308 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
309 BHV_TO_VNODE(bdp), *offset, size,
310 FILP_DELAY_FLAG(file), &locktype);
311 if (ret) {
312 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
313 goto unlock_isem;
314 }
315 }
316
317 xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
318 (void *)iovp, segs, *offset, ioflags);
319 ret = __generic_file_aio_read(iocb, iovp, segs, offset);
320 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
321 ret = wait_on_sync_kiocb(iocb);
322 if (ret > 0)
323 XFS_STATS_ADD(xs_read_bytes, ret);
324
325 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
326
327 if (likely(!(ioflags & IO_INVIS)))
328 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
329
330unlock_isem:
331 if (unlikely(ioflags & IO_ISDIRECT))
332 up(&inode->i_sem);
333 return ret;
334}
335
336ssize_t
337xfs_sendfile(
338 bhv_desc_t *bdp,
339 struct file *filp,
340 loff_t *offset,
341 int ioflags,
342 size_t count,
343 read_actor_t actor,
344 void *target,
345 cred_t *credp)
346{
347 ssize_t ret;
348 xfs_fsize_t n;
349 xfs_inode_t *ip;
350 xfs_mount_t *mp;
351 vnode_t *vp;
352
353 ip = XFS_BHVTOI(bdp);
354 vp = BHV_TO_VNODE(bdp);
355 mp = ip->i_mount;
356
357 XFS_STATS_INC(xs_read_calls);
358
359 n = XFS_MAXIOFFSET(mp) - *offset;
360 if ((n <= 0) || (count == 0))
361 return 0;
362
363 if (n < count)
364 count = n;
365
366 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
367 return -EIO;
368
369 xfs_ilock(ip, XFS_IOLOCK_SHARED);
370
371 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
372 (!(ioflags & IO_INVIS))) {
373 vrwlock_t locktype = VRWLOCK_READ;
374 int error;
375
376 error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
377 FILP_DELAY_FLAG(filp), &locktype);
378 if (error) {
379 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
380 return -error;
381 }
382 }
383 xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
384 (void *)(unsigned long)target, count, *offset, ioflags);
385 ret = generic_file_sendfile(filp, offset, count, actor, target);
386
387 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
388
389 if (ret > 0)
390 XFS_STATS_ADD(xs_read_bytes, ret);
391
392 if (likely(!(ioflags & IO_INVIS)))
393 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
394
395 return ret;
396}
397
398/*
399 * This routine is called to handle zeroing any space in the last
400 * block of the file that is beyond the EOF. We do this since the
401 * size is being increased without writing anything to that block
402 * and we don't want anyone to read the garbage on the disk.
403 */
404STATIC int /* error (positive) */
405xfs_zero_last_block(
406 struct inode *ip,
407 xfs_iocore_t *io,
408 xfs_off_t offset,
409 xfs_fsize_t isize,
410 xfs_fsize_t end_size)
411{
412 xfs_fileoff_t last_fsb;
413 xfs_mount_t *mp;
414 int nimaps;
415 int zero_offset;
416 int zero_len;
417 int isize_fsb_offset;
418 int error = 0;
419 xfs_bmbt_irec_t imap;
420 loff_t loff;
421 size_t lsize;
422
423 ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
424 ASSERT(offset > isize);
425
426 mp = io->io_mount;
427
428 isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
429 if (isize_fsb_offset == 0) {
430 /*
431 * There are no extra bytes in the last block on disk to
432 * zero, so return.
433 */
434 return 0;
435 }
436
437 last_fsb = XFS_B_TO_FSBT(mp, isize);
438 nimaps = 1;
439 error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
440 &nimaps, NULL);
441 if (error) {
442 return error;
443 }
444 ASSERT(nimaps > 0);
445 /*
446 * If the block underlying isize is just a hole, then there
447 * is nothing to zero.
448 */
449 if (imap.br_startblock == HOLESTARTBLOCK) {
450 return 0;
451 }
452 /*
453 * Zero the part of the last block beyond the EOF, and write it
454 * out sync. We need to drop the ilock while we do this so we
455 * don't deadlock when the buffer cache calls back to us.
456 */
457 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
458 loff = XFS_FSB_TO_B(mp, last_fsb);
459 lsize = XFS_FSB_TO_B(mp, 1);
460
461 zero_offset = isize_fsb_offset;
462 zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
463
464 error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
465
466 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
467 ASSERT(error >= 0);
468 return error;
469}
470
471/*
472 * Zero any on disk space between the current EOF and the new,
473 * larger EOF. This handles the normal case of zeroing the remainder
474 * of the last block in the file and the unusual case of zeroing blocks
475 * out beyond the size of the file. This second case only happens
476 * with fixed size extents and when the system crashes before the inode
477 * size was updated but after blocks were allocated. If fill is set,
478 * then any holes in the range are filled and zeroed. If not, the holes
479 * are left alone as holes.
480 */
481
482int /* error (positive) */
483xfs_zero_eof(
484 vnode_t *vp,
485 xfs_iocore_t *io,
486 xfs_off_t offset, /* starting I/O offset */
487 xfs_fsize_t isize, /* current inode size */
488 xfs_fsize_t end_size) /* terminal inode size */
489{
490 struct inode *ip = LINVFS_GET_IP(vp);
491 xfs_fileoff_t start_zero_fsb;
492 xfs_fileoff_t end_zero_fsb;
493 xfs_fileoff_t prev_zero_fsb;
494 xfs_fileoff_t zero_count_fsb;
495 xfs_fileoff_t last_fsb;
496 xfs_extlen_t buf_len_fsb;
497 xfs_extlen_t prev_zero_count;
498 xfs_mount_t *mp;
499 int nimaps;
500 int error = 0;
501 xfs_bmbt_irec_t imap;
502 loff_t loff;
503 size_t lsize;
504
505 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
506 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
507
508 mp = io->io_mount;
509
510 /*
511 * First handle zeroing the block on which isize resides.
512 * We only zero a part of that block so it is handled specially.
513 */
514 error = xfs_zero_last_block(ip, io, offset, isize, end_size);
515 if (error) {
516 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
517 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
518 return error;
519 }
520
521 /*
522 * Calculate the range between the new size and the old
523 * where blocks needing to be zeroed may exist. To get the
524 * block where the last byte in the file currently resides,
525 * we need to subtract one from the size and truncate back
526 * to a block boundary. We subtract 1 in case the size is
527 * exactly on a block boundary.
528 */
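	/*
	 * e.g. with 4k blocks and isize == 8192 the last byte is at
	 * offset 8191, so last_fsb == 1 and start_zero_fsb == 2.
	 */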
529 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
530 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
531 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
532 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
533 if (last_fsb == end_zero_fsb) {
534 /*
535 * The size was only incremented on its last block.
536 * We took care of that above, so just return.
537 */
538 return 0;
539 }
540
541 ASSERT(start_zero_fsb <= end_zero_fsb);
542 prev_zero_fsb = NULLFILEOFF;
543 prev_zero_count = 0;
544 while (start_zero_fsb <= end_zero_fsb) {
545 nimaps = 1;
546 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
547 error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
548 0, NULL, 0, &imap, &nimaps, NULL);
549 if (error) {
550 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
551 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
552 return error;
553 }
554 ASSERT(nimaps > 0);
555
556 if (imap.br_state == XFS_EXT_UNWRITTEN ||
557 imap.br_startblock == HOLESTARTBLOCK) {
558 /*
559 * This loop handles initializing pages that were
560 * partially initialized by the code below this
561 * loop. It basically zeroes the part of the page
562 * that sits on a hole and sets the page as P_HOLE
563 * and calls remapf if it is a mapped file.
564 */
565 prev_zero_fsb = NULLFILEOFF;
566 prev_zero_count = 0;
567 start_zero_fsb = imap.br_startoff +
568 imap.br_blockcount;
569 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
570 continue;
571 }
572
573 /*
574 * There are blocks in the range requested.
575 * Zero them a single write at a time. We actually
576 * don't zero the entire range returned if it is
577 * too big and simply loop around to get the rest.
578 * That is not the most efficient thing to do, but it
579 * is simple and this path should not be exercised often.
580 */
581 buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
582 mp->m_writeio_blocks << 8);
583 /*
584 * Drop the inode lock while we're doing the I/O.
585 * We'll still have the iolock to protect us.
586 */
587 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
588
589 loff = XFS_FSB_TO_B(mp, start_zero_fsb);
590 lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
591
592 error = xfs_iozero(ip, loff, lsize, end_size);
593
594 if (error) {
595 goto out_lock;
596 }
597
598 prev_zero_fsb = start_zero_fsb;
599 prev_zero_count = buf_len_fsb;
600 start_zero_fsb = imap.br_startoff + buf_len_fsb;
601 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
602
603 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
604 }
605
606 return 0;
607
608out_lock:
609
610 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
611 ASSERT(error >= 0);
612 return error;
613}
614
615ssize_t /* bytes written, or (-) error */
616xfs_write(
617 bhv_desc_t *bdp,
618 struct kiocb *iocb,
619 const struct iovec *iovp,
620 unsigned int nsegs,
621 loff_t *offset,
622 int ioflags,
623 cred_t *credp)
624{
625 struct file *file = iocb->ki_filp;
626 struct address_space *mapping = file->f_mapping;
627 struct inode *inode = mapping->host;
628 unsigned long segs = nsegs;
629 xfs_inode_t *xip;
630 xfs_mount_t *mp;
631 ssize_t ret = 0, error = 0;
632 xfs_fsize_t isize, new_size;
633 xfs_iocore_t *io;
634 vnode_t *vp;
635 unsigned long seg;
636 int iolock;
637 int eventsent = 0;
638 vrwlock_t locktype;
639 size_t ocount = 0, count;
640 loff_t pos;
641 int need_isem = 1, need_flush = 0;
642
643 XFS_STATS_INC(xs_write_calls);
644
645 vp = BHV_TO_VNODE(bdp);
646 xip = XFS_BHVTOI(bdp);
647
648 for (seg = 0; seg < segs; seg++) {
649 const struct iovec *iv = &iovp[seg];
650
651 /*
652 * If any segment has a negative length, or the cumulative
653 * length ever wraps negative then return -EINVAL.
654 */
655 ocount += iv->iov_len;
656 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
657 return -EINVAL;
658 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
659 continue;
660 if (seg == 0)
661 return -EFAULT;
662 segs = seg;
663 ocount -= iv->iov_len; /* This segment is no good */
664 break;
665 }
666
667 count = ocount;
668 pos = *offset;
669
670 if (count == 0)
671 return 0;
672
673 io = &xip->i_iocore;
674 mp = io->io_mount;
675
676 if (XFS_FORCED_SHUTDOWN(mp))
677 return -EIO;
678
679 fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
680
681 if (ioflags & IO_ISDIRECT) {
682 xfs_buftarg_t *target =
683 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
684 mp->m_rtdev_targp : mp->m_ddev_targp;
685
686 if ((pos & target->pbr_smask) || (count & target->pbr_smask))
 687		return -XFS_ERROR(EINVAL);
688
689 if (!VN_CACHED(vp) && pos < i_size_read(inode))
690 need_isem = 0;
691
692 if (VN_CACHED(vp))
693 need_flush = 1;
694 }
695
696relock:
697 if (need_isem) {
698 iolock = XFS_IOLOCK_EXCL;
699 locktype = VRWLOCK_WRITE;
700
701 down(&inode->i_sem);
702 } else {
703 iolock = XFS_IOLOCK_SHARED;
704 locktype = VRWLOCK_WRITE_DIRECT;
705 }
706
707 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
708
709 isize = i_size_read(inode);
710
711 if (file->f_flags & O_APPEND)
712 *offset = isize;
713
714start:
715 error = -generic_write_checks(file, &pos, &count,
716 S_ISBLK(inode->i_mode));
717 if (error) {
718 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
719 goto out_unlock_isem;
720 }
721
722 new_size = pos + count;
723 if (new_size > isize)
724 io->io_new_size = new_size;
725
726 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
727 !(ioflags & IO_INVIS) && !eventsent)) {
728 loff_t savedsize = pos;
729 int dmflags = FILP_DELAY_FLAG(file);
730
731 if (need_isem)
732 dmflags |= DM_FLAGS_ISEM;
733
734 xfs_iunlock(xip, XFS_ILOCK_EXCL);
735 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
736 pos, count,
737 dmflags, &locktype);
738 if (error) {
739 xfs_iunlock(xip, iolock);
740 goto out_unlock_isem;
741 }
742 xfs_ilock(xip, XFS_ILOCK_EXCL);
743 eventsent = 1;
744
745 /*
 746	 * The iolock was dropped and reacquired in XFS_SEND_DATA
747 * so we have to recheck the size when appending.
748 * We will only "goto start;" once, since having sent the
749 * event prevents another call to XFS_SEND_DATA, which is
750 * what allows the size to change in the first place.
751 */
752 if ((file->f_flags & O_APPEND) && savedsize != isize) {
753 pos = isize = xip->i_d.di_size;
754 goto start;
755 }
756 }
757
758 /*
759 * On Linux, generic_file_write updates the times even if
760 * no data is copied in so long as the write had a size.
761 *
 762	 * We must update the XFS times here, or a later revalidate would copy the stale Linux times back over them.
763 */
764 if (!(ioflags & IO_INVIS)) {
765 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
766 inode_update_time(inode, 1);
767 }
768
769 /*
770 * If the offset is beyond the size of the file, we have a couple
771 * of things to do. First, if there is already space allocated
772 * we need to either create holes or zero the disk or ...
773 *
774 * If there is a page where the previous size lands, we need
775 * to zero it out up to the new size.
776 */
777
778 if (pos > isize) {
779 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
780 isize, pos + count);
781 if (error) {
782 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
783 goto out_unlock_isem;
784 }
785 }
786 xfs_iunlock(xip, XFS_ILOCK_EXCL);
787
788 /*
789 * If we're writing the file then make sure to clear the
790 * setuid and setgid bits if the process is not being run
791 * by root. This keeps people from modifying setuid and
792 * setgid binaries.
793 */
794
795 if (((xip->i_d.di_mode & S_ISUID) ||
796 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
797 (S_ISGID | S_IXGRP))) &&
798 !capable(CAP_FSETID)) {
799 error = xfs_write_clear_setuid(xip);
800 if (likely(!error))
801 error = -remove_suid(file->f_dentry);
802 if (unlikely(error)) {
803 xfs_iunlock(xip, iolock);
804 goto out_unlock_isem;
805 }
806 }
807
808retry:
809 /* We can write back this queue in page reclaim */
810 current->backing_dev_info = mapping->backing_dev_info;
811
812 if ((ioflags & IO_ISDIRECT)) {
813 if (need_flush) {
814 xfs_inval_cached_trace(io, pos, -1,
815 ctooff(offtoct(pos)), -1);
816 VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
817 -1, FI_REMAPF_LOCKED);
818 }
819
820 if (need_isem) {
821 /* demote the lock now the cached pages are gone */
822 XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
823 up(&inode->i_sem);
824
825 iolock = XFS_IOLOCK_SHARED;
826 locktype = VRWLOCK_WRITE_DIRECT;
827 need_isem = 0;
828 }
829
830 xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
831 *offset, ioflags);
832 ret = generic_file_direct_write(iocb, iovp,
833 &segs, pos, offset, count, ocount);
834
835 /*
836 * direct-io write to a hole: fall through to buffered I/O
837 * for completing the rest of the request.
838 */
839 if (ret >= 0 && ret != count) {
840 XFS_STATS_ADD(xs_write_bytes, ret);
841
842 pos += ret;
843 count -= ret;
844
845 need_isem = 1;
846 ioflags &= ~IO_ISDIRECT;
847 xfs_iunlock(xip, iolock);
848 goto relock;
849 }
850 } else {
851 xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
852 *offset, ioflags);
853 ret = generic_file_buffered_write(iocb, iovp, segs,
854 pos, offset, count, ret);
855 }
856
857 current->backing_dev_info = NULL;
858
859 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
860 ret = wait_on_sync_kiocb(iocb);
861
862 if ((ret == -ENOSPC) &&
863 DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
864 !(ioflags & IO_INVIS)) {
865
866 xfs_rwunlock(bdp, locktype);
867 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
868 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
869 0, 0, 0); /* Delay flag intentionally unused */
870 if (error)
871 goto out_unlock_isem;
872 xfs_rwlock(bdp, locktype);
873 pos = xip->i_d.di_size;
874 ret = 0;
875 goto retry;
876 }
877
878 if (*offset > xip->i_d.di_size) {
879 xfs_ilock(xip, XFS_ILOCK_EXCL);
880 if (*offset > xip->i_d.di_size) {
881 xip->i_d.di_size = *offset;
882 i_size_write(inode, *offset);
883 xip->i_update_core = 1;
884 xip->i_update_size = 1;
885 }
886 xfs_iunlock(xip, XFS_ILOCK_EXCL);
887 }
888
889 error = -ret;
890 if (ret <= 0)
891 goto out_unlock_internal;
892
893 XFS_STATS_ADD(xs_write_bytes, ret);
894
895 /* Handle various SYNC-type writes */
896 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
897 /*
898 * If we're treating this as O_DSYNC and we have not updated the
899 * size, force the log.
900 */
901 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
902 !(xip->i_update_size)) {
903 xfs_inode_log_item_t *iip = xip->i_itemp;
904
905 /*
906 * If an allocation transaction occurred
907 * without extending the size, then we have to force
908 * the log up the proper point to ensure that the
909 * allocation is permanent. We can't count on
910 * the fact that buffered writes lock out direct I/O
911 * writes - the direct I/O write could have extended
912 * the size nontransactionally, then finished before
 913	 * we started.  xfs_write() would then think that the file
914 * didn't grow but the update isn't safe unless the
915 * size change is logged.
916 *
917 * Force the log if we've committed a transaction
918 * against the inode or if someone else has and
919 * the commit record hasn't gone to disk (e.g.
920 * the inode is pinned). This guarantees that
921 * all changes affecting the inode are permanent
922 * when we return.
923 */
924 if (iip && iip->ili_last_lsn) {
925 xfs_log_force(mp, iip->ili_last_lsn,
926 XFS_LOG_FORCE | XFS_LOG_SYNC);
927 } else if (xfs_ipincount(xip) > 0) {
928 xfs_log_force(mp, (xfs_lsn_t)0,
929 XFS_LOG_FORCE | XFS_LOG_SYNC);
930 }
931
932 } else {
933 xfs_trans_t *tp;
934
935 /*
936 * O_SYNC or O_DSYNC _with_ a size update are handled
937 * the same way.
938 *
939 * If the write was synchronous then we need to make
940 * sure that the inode modification time is permanent.
941 * We'll have updated the timestamp above, so here
942 * we use a synchronous transaction to log the inode.
943 * It's not fast, but it's necessary.
944 *
945 * If this a dsync write and the size got changed
946 * non-transactionally, then we need to ensure that
947 * the size change gets logged in a synchronous
948 * transaction.
949 */
950
951 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
952 if ((error = xfs_trans_reserve(tp, 0,
953 XFS_SWRITE_LOG_RES(mp),
954 0, 0, 0))) {
955 /* Transaction reserve failed */
956 xfs_trans_cancel(tp, 0);
957 } else {
958 /* Transaction reserve successful */
959 xfs_ilock(xip, XFS_ILOCK_EXCL);
960 xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
961 xfs_trans_ihold(tp, xip);
962 xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
963 xfs_trans_set_sync(tp);
964 error = xfs_trans_commit(tp, 0, NULL);
965 xfs_iunlock(xip, XFS_ILOCK_EXCL);
966 }
967 if (error)
968 goto out_unlock_internal;
969 }
970
971 xfs_rwunlock(bdp, locktype);
972 if (need_isem)
973 up(&inode->i_sem);
974
975 error = sync_page_range(inode, mapping, pos, ret);
976 if (!error)
977 error = ret;
978 return error;
979 }
980
981 out_unlock_internal:
982 xfs_rwunlock(bdp, locktype);
983 out_unlock_isem:
984 if (need_isem)
985 up(&inode->i_sem);
986 return -error;
987}
988
989/*
990 * All xfs metadata buffers except log state machine buffers
991 * get this attached as their b_bdstrat callback function.
992 * This is so that we can catch a buffer
993 * after prematurely unpinning it to forcibly shutdown the filesystem.
994 */
995int
996xfs_bdstrat_cb(struct xfs_buf *bp)
997{
998 xfs_mount_t *mp;
999
1000 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
1001 if (!XFS_FORCED_SHUTDOWN(mp)) {
1002 pagebuf_iorequest(bp);
1003 return 0;
1004 } else {
1005 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
1006 /*
1007 * Metadata write that didn't get logged but
1008 * written delayed anyway. These aren't associated
1009 * with a transaction, and can be ignored.
1010 */
1011 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
1012 (XFS_BUF_ISREAD(bp)) == 0)
1013 return (xfs_bioerror_relse(bp));
1014 else
1015 return (xfs_bioerror(bp));
1016 }
1017}
1018
1019
1020int
1021xfs_bmap(bhv_desc_t *bdp,
1022 xfs_off_t offset,
1023 ssize_t count,
1024 int flags,
1025 xfs_iomap_t *iomapp,
1026 int *niomaps)
1027{
1028 xfs_inode_t *ip = XFS_BHVTOI(bdp);
1029 xfs_iocore_t *io = &ip->i_iocore;
1030
1031 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
1032 ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
1033 ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
1034
1035 return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
1036}
1037
1038/*
1039 * Wrapper around bdstrat so that we can stop data
1040 * from going to disk in case we are shutting down the filesystem.
 1041 * Typically user data goes through this path; one of the exceptions
1042 * is the superblock.
1043 */
1044int
1045xfsbdstrat(
1046 struct xfs_mount *mp,
1047 struct xfs_buf *bp)
1048{
1049 ASSERT(mp);
1050 if (!XFS_FORCED_SHUTDOWN(mp)) {
1051 /* Grio redirection would go here
1052 * if (XFS_BUF_IS_GRIO(bp)) {
1053 */
1054
1055 pagebuf_iorequest(bp);
1056 return 0;
1057 }
1058
1059 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
1060 return (xfs_bioerror_relse(bp));
1061}
1062
1063/*
1064 * If the underlying (data/log/rt) device is readonly, there are some
1065 * operations that cannot proceed.
1066 */
1067int
1068xfs_dev_is_read_only(
1069 xfs_mount_t *mp,
1070 char *message)
1071{
1072 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
1073 xfs_readonly_buftarg(mp->m_logdev_targp) ||
1074 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1075 cmn_err(CE_NOTE,
1076 "XFS: %s required on read-only device.", message);
1077 cmn_err(CE_NOTE,
1078 "XFS: write access unavailable, cannot proceed.");
1079 return EROFS;
1080 }
1081 return 0;
1082}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
new file mode 100644
index 000000000000..d723e35254a0
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LRW_H__
33#define __XFS_LRW_H__
34
35struct vnode;
36struct bhv_desc;
37struct xfs_mount;
38struct xfs_iocore;
39struct xfs_inode;
40struct xfs_bmbt_irec;
41struct xfs_buf;
42struct xfs_iomap;
43
44#if defined(XFS_RW_TRACE)
45/*
46 * Defines for the trace mechanisms in xfs_lrw.c.
47 */
48#define XFS_RW_KTRACE_SIZE 128
49
50#define XFS_READ_ENTER 1
51#define XFS_WRITE_ENTER 2
52#define XFS_IOMAP_READ_ENTER 3
53#define XFS_IOMAP_WRITE_ENTER 4
54#define XFS_IOMAP_READ_MAP 5
55#define XFS_IOMAP_WRITE_MAP 6
56#define XFS_IOMAP_WRITE_NOSPACE 7
57#define XFS_ITRUNC_START 8
58#define XFS_ITRUNC_FINISH1 9
59#define XFS_ITRUNC_FINISH2 10
60#define XFS_CTRUNC1 11
61#define XFS_CTRUNC2 12
62#define XFS_CTRUNC3 13
63#define XFS_CTRUNC4 14
64#define XFS_CTRUNC5 15
65#define XFS_CTRUNC6 16
66#define XFS_BUNMAPI 17
67#define XFS_INVAL_CACHED 18
68#define XFS_DIORD_ENTER 19
69#define XFS_DIOWR_ENTER 20
70#define XFS_SENDFILE_ENTER 21
71#define XFS_WRITEPAGE_ENTER 22
72#define XFS_RELEASEPAGE_ENTER 23
73#define XFS_IOMAP_ALLOC_ENTER 24
74#define XFS_IOMAP_ALLOC_MAP 25
75#define XFS_IOMAP_UNWRITTEN 26
76extern void xfs_rw_enter_trace(int, struct xfs_iocore *,
77 void *, size_t, loff_t, int);
78extern void xfs_inval_cached_trace(struct xfs_iocore *,
79 xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t);
80#else
81#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags)
82#define xfs_inval_cached_trace(io, offset, len, first, last)
83#endif
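
When XFS_RW_TRACE is off, the trace entry points compile away to empty macros, so call sites need no #ifdefs of their own. A tiny self-contained sketch of the same idiom (my_trace is a made-up name; note that an empty macro also silently discards any side effects in its arguments):

#include <stdio.h>

/* Toggle at build time, e.g. cc -DMY_TRACE ... */
#ifdef MY_TRACE
# define my_trace(tag, val)	printf("trace %s: %d\n", (tag), (val))
#else
# define my_trace(tag, val)	/* compiles away to nothing */
#endif

int main(void)
{
	my_trace("enter", 42);	/* no #ifdef needed at the call site */
	return 0;
}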
84
85/*
86 * Maximum count of bmaps used by read and write paths.
87 */
88#define XFS_MAX_RW_NBMAPS 4
89
90extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int,
91 struct xfs_iomap *, int *);
92extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
93extern int xfs_bdstrat_cb(struct xfs_buf *);
94
95extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t,
96 xfs_fsize_t, xfs_fsize_t);
97extern void xfs_inval_cached_pages(struct vnode *, struct xfs_iocore *,
98 xfs_off_t, int, int);
99extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
100 const struct iovec *, unsigned int,
101 loff_t *, int, struct cred *);
102extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
103 const struct iovec *, unsigned int,
104 loff_t *, int, struct cred *);
105extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
106 loff_t *, int, size_t, read_actor_t,
107 void *, struct cred *);
108
109extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
110
111#define XFS_FSB_TO_DB_IO(io,fsb) \
112 (((io)->io_flags & XFS_IOCORE_RT) ? \
113 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
114 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
115
116#endif /* __XFS_LRW_H__ */
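
XFS_FSB_TO_DB_IO picks one of two filesystem-block-to-disk-address conversions: realtime files map block-for-block into the rt device, while ordinary files go through the data-device mapping, which also encodes allocation-group placement. A userspace sketch of the selection; the conversion arithmetic (the factor of 8, the 1024 offset) is invented purely for illustration, since the real values come from the mount geometry:

#include <stdio.h>

/* Hypothetical stand-ins for the two conversions the macro selects
 * between; real XFS derives both from the mount structure. */
#define IOCORE_RT		0x1
#define FSB_TO_BB(fsb)		((fsb) * 8)		/* rt: fs block -> 512B basic blocks */
#define FSB_TO_DADDR(fsb)	((fsb) * 8 + 1024)	/* data dev: adds AG placement */

struct mock_iocore { int flags; };

#define FSB_TO_DB_IO(io, fsb) \
	(((io)->flags & IOCORE_RT) ? FSB_TO_BB(fsb) : FSB_TO_DADDR(fsb))

int main(void)
{
	struct mock_iocore rt = { IOCORE_RT }, data = { 0 };

	printf("rt file,   fsb 10 -> daddr %d\n", FSB_TO_DB_IO(&rt, 10));
	printf("data file, fsb 10 -> daddr %d\n", FSB_TO_DB_IO(&data, 10));
	return 0;
}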
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
new file mode 100644
index 000000000000..aaf5ddba47f3
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -0,0 +1,132 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include <linux/proc_fs.h>
35
36DEFINE_PER_CPU(struct xfsstats, xfsstats);
37
38STATIC int
39xfs_read_xfsstats(
40 char *buffer,
41 char **start,
42 off_t offset,
43 int count,
44 int *eof,
45 void *data)
46{
47 int c, i, j, len, val;
48 __uint64_t xs_xstrat_bytes = 0;
49 __uint64_t xs_write_bytes = 0;
50 __uint64_t xs_read_bytes = 0;
51
52 static struct xstats_entry {
53 char *desc;
54 int endpoint;
55 } xstats[] = {
56 { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC },
57 { "abt", XFSSTAT_END_ALLOC_BTREE },
58 { "blk_map", XFSSTAT_END_BLOCK_MAPPING },
59 { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE },
60 { "dir", XFSSTAT_END_DIRECTORY_OPS },
61 { "trans", XFSSTAT_END_TRANSACTIONS },
62 { "ig", XFSSTAT_END_INODE_OPS },
63 { "log", XFSSTAT_END_LOG_OPS },
64 { "push_ail", XFSSTAT_END_TAIL_PUSHING },
65 { "xstrat", XFSSTAT_END_WRITE_CONVERT },
66 { "rw", XFSSTAT_END_READ_WRITE_OPS },
67 { "attr", XFSSTAT_END_ATTRIBUTE_OPS },
68 { "icluster", XFSSTAT_END_INODE_CLUSTER },
69 { "vnodes", XFSSTAT_END_VNODE_OPS },
70 { "buf", XFSSTAT_END_BUF },
71 };
72
73 /* Loop over all stats groups */
 74	for (i = j = len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) {
 75		len += sprintf(buffer + len, "%s", xstats[i].desc);
76 /* inner loop does each group */
77 while (j < xstats[i].endpoint) {
78 val = 0;
79 /* sum over all cpus */
80 for (c = 0; c < NR_CPUS; c++) {
81 if (!cpu_possible(c)) continue;
82 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
83 }
84 len += sprintf(buffer + len, " %u", val);
85 j++;
86 }
87 buffer[len++] = '\n';
88 }
89 /* extra precision counters */
90 for (i = 0; i < NR_CPUS; i++) {
91 if (!cpu_possible(i)) continue;
92 xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
93 xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
94 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
95 }
96
97 len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
98 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
99 len += sprintf(buffer + len, "debug %u\n",
100#if defined(DEBUG)
101 1);
102#else
103 0);
104#endif
105
106 if (offset >= len) {
107 *start = buffer;
108 *eof = 1;
109 return 0;
110 }
111 *start = buffer + offset;
112 if ((len -= offset) > count)
113 return count;
114 *eof = 1;
115
116 return len;
117}
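
The reader above treats each per-CPU struct xfsstats as one flat array of __u32 counters: the cumulative XFSSTAT_END_* values say where each named group stops, the index j carries across groups, and each counter is summed over all possible CPUs. A self-contained sketch of the same walk with two mock "CPUs" and two groups:

#include <stdio.h>
#include <stdint.h>

#define END_ALLOC	2	/* counters 0..1 */
#define END_BTREE	5	/* counters 2..4: cumulative, like XFSSTAT_END_* */
#define NCPUS		2

static const struct { const char *desc; int endpoint; } groups[] = {
	{ "alloc", END_ALLOC },
	{ "btree", END_BTREE },
};

/* Stand-in for per_cpu(xfsstats, c) viewed as a __u32 array. */
static uint32_t percpu[NCPUS][END_BTREE] = {
	{ 1, 2, 3, 4, 5 },
	{ 10, 20, 30, 40, 50 },
};

int main(void)
{
	int i, j = 0, c;

	for (i = 0; i < (int)(sizeof(groups)/sizeof(groups[0])); i++) {
		printf("%s", groups[i].desc);
		while (j < groups[i].endpoint) {	/* j carries across groups */
			uint32_t val = 0;
			for (c = 0; c < NCPUS; c++)
				val += percpu[c][j];
			printf(" %u", val);
			j++;
		}
		printf("\n");
	}
	return 0;
}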
118
119void
120xfs_init_procfs(void)
121{
122 if (!proc_mkdir("fs/xfs", NULL))
123 return;
124 create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL);
125}
126
127void
128xfs_cleanup_procfs(void)
129{
130 remove_proc_entry("fs/xfs/stat", NULL);
131 remove_proc_entry("fs/xfs", NULL);
132}
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
new file mode 100644
index 000000000000..3f756a6c3eb0
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -0,0 +1,166 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_STATS_H__
33#define __XFS_STATS_H__
34
35
36#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
37
38#include <linux/percpu.h>
39
40/*
41 * XFS global statistics
42 */
43struct xfsstats {
44# define XFSSTAT_END_EXTENT_ALLOC 4
45 __uint32_t xs_allocx;
46 __uint32_t xs_allocb;
47 __uint32_t xs_freex;
48 __uint32_t xs_freeb;
49# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4)
50 __uint32_t xs_abt_lookup;
51 __uint32_t xs_abt_compare;
52 __uint32_t xs_abt_insrec;
53 __uint32_t xs_abt_delrec;
54# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7)
55 __uint32_t xs_blk_mapr;
56 __uint32_t xs_blk_mapw;
57 __uint32_t xs_blk_unmap;
58 __uint32_t xs_add_exlist;
59 __uint32_t xs_del_exlist;
60 __uint32_t xs_look_exlist;
61 __uint32_t xs_cmp_exlist;
62# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4)
63 __uint32_t xs_bmbt_lookup;
64 __uint32_t xs_bmbt_compare;
65 __uint32_t xs_bmbt_insrec;
66 __uint32_t xs_bmbt_delrec;
67# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4)
68 __uint32_t xs_dir_lookup;
69 __uint32_t xs_dir_create;
70 __uint32_t xs_dir_remove;
71 __uint32_t xs_dir_getdents;
72# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3)
73 __uint32_t xs_trans_sync;
74 __uint32_t xs_trans_async;
75 __uint32_t xs_trans_empty;
76# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7)
77 __uint32_t xs_ig_attempts;
78 __uint32_t xs_ig_found;
79 __uint32_t xs_ig_frecycle;
80 __uint32_t xs_ig_missed;
81 __uint32_t xs_ig_dup;
82 __uint32_t xs_ig_reclaims;
83 __uint32_t xs_ig_attrchg;
84# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5)
85 __uint32_t xs_log_writes;
86 __uint32_t xs_log_blocks;
87 __uint32_t xs_log_noiclogs;
88 __uint32_t xs_log_force;
89 __uint32_t xs_log_force_sleep;
90# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10)
91 __uint32_t xs_try_logspace;
92 __uint32_t xs_sleep_logspace;
93 __uint32_t xs_push_ail;
94 __uint32_t xs_push_ail_success;
95 __uint32_t xs_push_ail_pushbuf;
96 __uint32_t xs_push_ail_pinned;
97 __uint32_t xs_push_ail_locked;
98 __uint32_t xs_push_ail_flushing;
99 __uint32_t xs_push_ail_restarts;
100 __uint32_t xs_push_ail_flush;
101# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2)
102 __uint32_t xs_xstrat_quick;
103 __uint32_t xs_xstrat_split;
104# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2)
105 __uint32_t xs_write_calls;
106 __uint32_t xs_read_calls;
107# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4)
108 __uint32_t xs_attr_get;
109 __uint32_t xs_attr_set;
110 __uint32_t xs_attr_remove;
111 __uint32_t xs_attr_list;
112# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3)
113 __uint32_t xs_iflush_count;
114 __uint32_t xs_icluster_flushcnt;
115 __uint32_t xs_icluster_flushinode;
116# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8)
117 __uint32_t vn_active; /* # vnodes not on free lists */
118 __uint32_t vn_alloc; /* # times vn_alloc called */
119 __uint32_t vn_get; /* # times vn_get called */
120 __uint32_t vn_hold; /* # times vn_hold called */
121 __uint32_t vn_rele; /* # times vn_rele called */
122 __uint32_t vn_reclaim; /* # times vn_reclaim called */
123 __uint32_t vn_remove; /* # times vn_remove called */
124 __uint32_t vn_free; /* # times vn_free called */
125#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9)
126 __uint32_t pb_get;
127 __uint32_t pb_create;
128 __uint32_t pb_get_locked;
129 __uint32_t pb_get_locked_waited;
130 __uint32_t pb_busy_locked;
131 __uint32_t pb_miss_locked;
132 __uint32_t pb_page_retries;
133 __uint32_t pb_page_found;
134 __uint32_t pb_get_read;
135/* Extra precision counters */
136 __uint64_t xs_xstrat_bytes;
137 __uint64_t xs_write_bytes;
138 __uint64_t xs_read_bytes;
139};
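
The layout contract here is that each XFSSTAT_END_* equals the previous endpoint plus the number of __uint32_t fields in its group, so the final XFSSTAT_END_BUF must equal the count of 32-bit counters preceding the 64-bit "extra precision" tail. A compile-time check one could write for a condensed mock of that layout (this assertion is not part of the original header):

#include <stddef.h>
#include <stdint.h>

/* Condensed mock: two 32-bit groups followed by a 64-bit tail,
 * mirroring the layout rule of struct xfsstats. */
struct mock_stats {
#define END_A	2
	uint32_t a0, a1;
#define END_B	(END_A + 4)
	uint32_t b0, b1, b2, b3;
	uint64_t extra;		/* like xs_xstrat_bytes & friends */
};

/* The reader indexes the struct as a flat uint32_t array up to the
 * last END_* value, so that value must match the 64-bit tail offset
 * (negative array size = compile error if the rule is violated). */
typedef char layout_ok[(offsetof(struct mock_stats, extra) ==
			END_B * sizeof(uint32_t)) ? 1 : -1];

int main(void) { return 0; }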
140
141DECLARE_PER_CPU(struct xfsstats, xfsstats);
142
143/*
144 * We don't disable preempt, not too worried about poking the
145 * wrong CPU's stat for now (also aggregated before reporting).
146 */
147#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
148#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
149#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
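
Call sites use these like plain counters and, as the comment notes, rely on the proc reader's cross-CPU summation to absorb any preemption races. A userspace mock of the same shape, with current_cpu() faked and a two-slot array standing in for the per-cpu data:

#include <stdio.h>

/* Userspace mock of the per-cpu counter macros. */
struct mock_stats { unsigned write_calls; unsigned long long write_bytes; };
static struct mock_stats stats[2];
#define current_cpu()		0	/* faked; really the executing CPU */
#define STATS_INC(v)		(stats[current_cpu()].v++)
#define STATS_ADD(v, inc)	(stats[current_cpu()].v += (inc))

int main(void)
{
	STATS_INC(write_calls);		/* e.g. one write call ... */
	STATS_ADD(write_bytes, 4096);	/* ... that moved 4096 bytes */
	printf("calls=%u bytes=%llu\n",
	       stats[0].write_calls, stats[0].write_bytes);
	return 0;
}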
150
151extern void xfs_init_procfs(void);
152extern void xfs_cleanup_procfs(void);
153
154
155#else /* !CONFIG_PROC_FS */
156
157# define XFS_STATS_INC(count)
158# define XFS_STATS_DEC(count)
159# define XFS_STATS_ADD(count, inc)
160
161static __inline void xfs_init_procfs(void) { };
162static __inline void xfs_cleanup_procfs(void) { };
163
164#endif /* !CONFIG_PROC_FS */
165
166#endif /* __XFS_STATS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
new file mode 100644
index 000000000000..53dc658cafa6
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -0,0 +1,912 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_clnt.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_utils.h"
68#include "xfs_version.h"
69#include "xfs_ioctl32.h"
70
71#include <linux/namei.h>
72#include <linux/init.h>
73#include <linux/mount.h>
74#include <linux/writeback.h>
75
76STATIC struct quotactl_ops linvfs_qops;
77STATIC struct super_operations linvfs_sops;
78STATIC kmem_zone_t *linvfs_inode_zone;
79
80STATIC struct xfs_mount_args *
81xfs_args_allocate(
82 struct super_block *sb)
83{
84 struct xfs_mount_args *args;
85
86 args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP);
87 args->logbufs = args->logbufsize = -1;
88 strncpy(args->fsname, sb->s_id, MAXNAMELEN);
89
90 /* Copy the already-parsed mount(2) flags we're interested in */
91 if (sb->s_flags & MS_NOATIME)
92 args->flags |= XFSMNT_NOATIME;
93 if (sb->s_flags & MS_DIRSYNC)
94 args->flags |= XFSMNT_DIRSYNC;
95 if (sb->s_flags & MS_SYNCHRONOUS)
96 args->flags |= XFSMNT_WSYNC;
97
98 /* Default to 32 bit inodes on Linux all the time */
99 args->flags |= XFSMNT_32BITINODES;
100
101 return args;
102}
103
104__uint64_t
105xfs_max_file_offset(
106 unsigned int blockshift)
107{
108 unsigned int pagefactor = 1;
109 unsigned int bitshift = BITS_PER_LONG - 1;
110
111 /* Figure out maximum filesize, on Linux this can depend on
112 * the filesystem blocksize (on 32 bit platforms).
113 * __block_prepare_write does this in an [unsigned] long...
114 * page->index << (PAGE_CACHE_SHIFT - bbits)
115 * So, for page sized blocks (4K on 32 bit platforms),
116	 * this wraps at around 8TB (hence MAX_LFS_FILESIZE which is
117 * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
118 * but for smaller blocksizes it is less (bbits = log2 bsize).
119 * Note1: get_block_t takes a long (implicit cast from above)
120 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
121 * can optionally convert the [unsigned] long from above into
122 * an [unsigned] long long.
123 */
124
125#if BITS_PER_LONG == 32
126# if defined(CONFIG_LBD)
127 ASSERT(sizeof(sector_t) == 8);
128 pagefactor = PAGE_CACHE_SIZE;
129 bitshift = BITS_PER_LONG;
130# else
131 pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
132# endif
133#endif
134
135 return (((__uint64_t)pagefactor) << bitshift) - 1;
136}
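
Working the arithmetic for the common 32-bit case without CONFIG_LBD: with 4K pages and 4K blocks, pagefactor = 4096 >> (12 - 12) = 4096 and bitshift = 31, so the limit is (4096 << 31) - 1 = 2^43 - 1, the 8TB figure from the comment; with 1K blocks pagefactor drops to 1024 and the limit to 2^41 - 1 (2TB). A standalone check of those two cases:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12		/* 4K pages, as on i386 */
#define PAGE_SIZE	(1u << PAGE_SHIFT)

/* Same computation as above, specialized to 32-bit without CONFIG_LBD. */
static uint64_t max_file_offset(unsigned blockshift)
{
	unsigned pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);

	return ((uint64_t)pagefactor << 31) - 1;
}

int main(void)
{
	printf("4K blocks: %llu (2^43 - 1, ~8TB)\n",
	       (unsigned long long)max_file_offset(12));
	printf("1K blocks: %llu (2^41 - 1, ~2TB)\n",
	       (unsigned long long)max_file_offset(10));
	return 0;
}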
137
138STATIC __inline__ void
139xfs_set_inodeops(
140 struct inode *inode)
141{
142 vnode_t *vp = LINVFS_GET_VP(inode);
143
144 if (vp->v_type == VNON) {
145 vn_mark_bad(vp);
146 } else if (S_ISREG(inode->i_mode)) {
147 inode->i_op = &linvfs_file_inode_operations;
148 inode->i_fop = &linvfs_file_operations;
149 inode->i_mapping->a_ops = &linvfs_aops;
150 } else if (S_ISDIR(inode->i_mode)) {
151 inode->i_op = &linvfs_dir_inode_operations;
152 inode->i_fop = &linvfs_dir_operations;
153 } else if (S_ISLNK(inode->i_mode)) {
154 inode->i_op = &linvfs_symlink_inode_operations;
155 if (inode->i_blocks)
156 inode->i_mapping->a_ops = &linvfs_aops;
157 } else {
158 inode->i_op = &linvfs_file_inode_operations;
159 init_special_inode(inode, inode->i_mode, inode->i_rdev);
160 }
161}
162
163STATIC __inline__ void
164xfs_revalidate_inode(
165 xfs_mount_t *mp,
166 vnode_t *vp,
167 xfs_inode_t *ip)
168{
169 struct inode *inode = LINVFS_GET_IP(vp);
170
171 inode->i_mode = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type);
172 inode->i_nlink = ip->i_d.di_nlink;
173 inode->i_uid = ip->i_d.di_uid;
174 inode->i_gid = ip->i_d.di_gid;
175 if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
176 inode->i_rdev = 0;
177 } else {
178 xfs_dev_t dev = ip->i_df.if_u2.if_rdev;
179 inode->i_rdev = MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
180 }
181 inode->i_blksize = PAGE_CACHE_SIZE;
182 inode->i_generation = ip->i_d.di_gen;
183 i_size_write(inode, ip->i_d.di_size);
184 inode->i_blocks =
185 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
186 inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
187 inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
188 inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
189 inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
190 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
191 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
192 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
193 inode->i_flags |= S_IMMUTABLE;
194 else
195 inode->i_flags &= ~S_IMMUTABLE;
196 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
197 inode->i_flags |= S_APPEND;
198 else
199 inode->i_flags &= ~S_APPEND;
200 if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
201 inode->i_flags |= S_SYNC;
202 else
203 inode->i_flags &= ~S_SYNC;
204 if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
205 inode->i_flags |= S_NOATIME;
206 else
207 inode->i_flags &= ~S_NOATIME;
208 vp->v_flag &= ~VMODIFIED;
209}
210
211void
212xfs_initialize_vnode(
213 bhv_desc_t *bdp,
214 vnode_t *vp,
215 bhv_desc_t *inode_bhv,
216 int unlock)
217{
218 xfs_inode_t *ip = XFS_BHVTOI(inode_bhv);
219 struct inode *inode = LINVFS_GET_IP(vp);
220
221 if (!inode_bhv->bd_vobj) {
222 vp->v_vfsp = bhvtovfs(bdp);
223 bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops);
224 bhv_insert(VN_BHV_HEAD(vp), inode_bhv);
225 }
226
227 /*
228 * We need to set the ops vectors, and unlock the inode, but if
229 * we have been called during the new inode create process, it is
230 * too early to fill in the Linux inode. We will get called a
231 * second time once the inode is properly set up, and then we can
232 * finish our work.
233 */
234 if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) {
235 vp->v_type = IFTOVT(ip->i_d.di_mode);
236 xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
237 xfs_set_inodeops(inode);
238
239 ip->i_flags &= ~XFS_INEW;
240 barrier();
241
242 unlock_new_inode(inode);
243 }
244}
245
246int
247xfs_blkdev_get(
248 xfs_mount_t *mp,
249 const char *name,
250 struct block_device **bdevp)
251{
252 int error = 0;
253
254 *bdevp = open_bdev_excl(name, 0, mp);
255 if (IS_ERR(*bdevp)) {
256 error = PTR_ERR(*bdevp);
257 printk("XFS: Invalid device [%s], error=%d\n", name, error);
258 }
259
260 return -error;
261}
262
263void
264xfs_blkdev_put(
265 struct block_device *bdev)
266{
267 if (bdev)
268 close_bdev_excl(bdev);
269}
270
271
272STATIC struct inode *
273linvfs_alloc_inode(
274 struct super_block *sb)
275{
276 vnode_t *vp;
277
278 vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_zone,
279 kmem_flags_convert(KM_SLEEP));
280 if (!vp)
281 return NULL;
282 return LINVFS_GET_IP(vp);
283}
284
285STATIC void
286linvfs_destroy_inode(
287 struct inode *inode)
288{
289 kmem_cache_free(linvfs_inode_zone, LINVFS_GET_VP(inode));
290}
291
292STATIC void
293init_once(
294 void *data,
295 kmem_cache_t *cachep,
296 unsigned long flags)
297{
298 vnode_t *vp = (vnode_t *)data;
299
300 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
301 SLAB_CTOR_CONSTRUCTOR)
302 inode_init_once(LINVFS_GET_IP(vp));
303}
304
305STATIC int
306init_inodecache( void )
307{
308 linvfs_inode_zone = kmem_cache_create("linvfs_icache",
309 sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT,
310 init_once, NULL);
311 if (linvfs_inode_zone == NULL)
312 return -ENOMEM;
313 return 0;
314}
315
316STATIC void
317destroy_inodecache( void )
318{
319 if (kmem_cache_destroy(linvfs_inode_zone))
320 printk(KERN_WARNING "%s: cache still in use!\n", __FUNCTION__);
321}
322
323/*
324 * Attempt to flush the inode; this will actually fail
325 * if the inode is pinned, but we dirty the inode again
326 * at the point when it is unpinned after a log write,
327 * since this is when the inode itself becomes flushable.
328 */
329STATIC int
330linvfs_write_inode(
331 struct inode *inode,
332 int sync)
333{
334 vnode_t *vp = LINVFS_GET_VP(inode);
335 int error = 0, flags = FLUSH_INODE;
336
337 if (vp) {
338 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
339 if (sync)
340 flags |= FLUSH_SYNC;
341 VOP_IFLUSH(vp, flags, error);
342 if (error == EAGAIN) {
343 if (sync)
344 VOP_IFLUSH(vp, flags | FLUSH_LOG, error);
345 else
346 error = 0;
347 }
348 }
349
350 return -error;
351}
352
353STATIC void
354linvfs_clear_inode(
355 struct inode *inode)
356{
357 vnode_t *vp = LINVFS_GET_VP(inode);
358
359 if (vp) {
360 vn_rele(vp);
361 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
362 /*
363 * Do all our cleanup, and remove this vnode.
364 */
365 vn_remove(vp);
366 }
367}
368
369
370/*
371 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
372 * Doing this has two advantages:
373 * - It saves on stack space, which is tight in certain situations
374 * - It can be used (with care) as a mechanism to avoid deadlocks.
375 * Flushing while allocating in a full filesystem requires both.
376 */
377STATIC void
378xfs_syncd_queue_work(
379 struct vfs *vfs,
380 void *data,
381 void (*syncer)(vfs_t *, void *))
382{
383 vfs_sync_work_t *work;
384
385 work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP);
386 INIT_LIST_HEAD(&work->w_list);
387 work->w_syncer = syncer;
388 work->w_data = data;
389 work->w_vfs = vfs;
390 spin_lock(&vfs->vfs_sync_lock);
391 list_add_tail(&work->w_list, &vfs->vfs_sync_list);
392 spin_unlock(&vfs->vfs_sync_lock);
393 wake_up_process(vfs->vfs_sync_task);
394}
395
396/*
397 * Flush delayed allocate data, attempting to free up reserved space
398 * from existing allocations. At this point a new allocation attempt
399 * has failed with ENOSPC and we are in the process of scratching our
400 * heads, looking about for more room...
401 */
402STATIC void
403xfs_flush_inode_work(
404 vfs_t *vfs,
405 void *inode)
406{
407 filemap_flush(((struct inode *)inode)->i_mapping);
408 iput((struct inode *)inode);
409}
410
411void
412xfs_flush_inode(
413 xfs_inode_t *ip)
414{
415 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
416 struct vfs *vfs = XFS_MTOVFS(ip->i_mount);
417
418 igrab(inode);
419 xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
420 delay(HZ/2);
421}
422
423/*
424 * This is the "bigger hammer" version of xfs_flush_inode_work...
425 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
426 */
427STATIC void
428xfs_flush_device_work(
429 vfs_t *vfs,
430 void *inode)
431{
432 sync_blockdev(vfs->vfs_super->s_bdev);
433 iput((struct inode *)inode);
434}
435
436void
437xfs_flush_device(
438 xfs_inode_t *ip)
439{
440 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
441 struct vfs *vfs = XFS_MTOVFS(ip->i_mount);
442
443 igrab(inode);
444 xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
445 delay(HZ/2);
446 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
447}
448
449#define SYNCD_FLAGS (SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR)
450STATIC void
451vfs_sync_worker(
452 vfs_t *vfsp,
453 void *unused)
454{
455 int error;
456
457 if (!(vfsp->vfs_flag & VFS_RDONLY))
458 VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error);
459 vfsp->vfs_sync_seq++;
460 wmb();
461 wake_up(&vfsp->vfs_wait_single_sync_task);
462}
463
464STATIC int
465xfssyncd(
466 void *arg)
467{
468 long timeleft;
469 vfs_t *vfsp = (vfs_t *) arg;
470 struct list_head tmp;
471 struct vfs_sync_work *work, *n;
472
473 daemonize("xfssyncd");
474
475 vfsp->vfs_sync_work.w_vfs = vfsp;
476 vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
477 vfsp->vfs_sync_task = current;
478 wmb();
479 wake_up(&vfsp->vfs_wait_sync_task);
480
481 INIT_LIST_HEAD(&tmp);
482 timeleft = (xfs_syncd_centisecs * HZ) / 100;
483 for (;;) {
484 set_current_state(TASK_INTERRUPTIBLE);
485 timeleft = schedule_timeout(timeleft);
486 /* swsusp */
487 try_to_freeze(PF_FREEZE);
488 if (vfsp->vfs_flag & VFS_UMOUNT)
489 break;
490
491 spin_lock(&vfsp->vfs_sync_lock);
492 /*
493 * We can get woken by laptop mode, to do a sync -
494 * that's the (only!) case where the list would be
495 * empty with time remaining.
496 */
497 if (!timeleft || list_empty(&vfsp->vfs_sync_list)) {
498 if (!timeleft)
499 timeleft = (xfs_syncd_centisecs * HZ) / 100;
500 INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list);
501 list_add_tail(&vfsp->vfs_sync_work.w_list,
502 &vfsp->vfs_sync_list);
503 }
504 list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list)
505 list_move(&work->w_list, &tmp);
506 spin_unlock(&vfsp->vfs_sync_lock);
507
508 list_for_each_entry_safe(work, n, &tmp, w_list) {
509 (*work->w_syncer)(vfsp, work->w_data);
510 list_del(&work->w_list);
511 if (work == &vfsp->vfs_sync_work)
512 continue;
513 kmem_free(work, sizeof(struct vfs_sync_work));
514 }
515 }
516
517 vfsp->vfs_sync_task = NULL;
518 wmb();
519 wake_up(&vfsp->vfs_wait_sync_task);
520
521 return 0;
522}
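
The queue/drain shape above is a deliberate two-step: producers append under vfs_sync_lock and wake the thread, and the thread splices the whole list onto a private one before dropping the lock, so no work callback ever runs with the lock held. A minimal single-threaded sketch of the detach-then-run pattern (list handling simplified to LIFO; the real code uses list_move and preserves FIFO order):

#include <stdio.h>
#include <stdlib.h>

struct work {
	struct work *next;
	void (*fn)(void *);
	void *data;
};

static struct work *pending;	/* guarded by a spinlock in the real code */

/* Producer side: what xfs_syncd_queue_work does (minus locking/wakeup). */
static void queue_work(void (*fn)(void *), void *data)
{
	struct work *w = malloc(sizeof(*w));	/* unchecked: sketch only */
	w->fn = fn; w->data = data;
	w->next = pending; pending = w;
}

static void say(void *msg) { puts(msg); }

int main(void)
{
	queue_work(say, "flush inode");
	queue_work(say, "flush device");

	/* Consumer side: detach the whole list first ("splice to tmp"),
	 * so new work could be queued while these items run. */
	struct work *todo = pending;
	pending = NULL;
	while (todo) {
		struct work *w = todo;
		todo = w->next;
		w->fn(w->data);
		free(w);
	}
	return 0;
}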
523
524STATIC int
525linvfs_start_syncd(
526 vfs_t *vfsp)
527{
528 int pid;
529
530 pid = kernel_thread(xfssyncd, (void *) vfsp,
531 CLONE_VM | CLONE_FS | CLONE_FILES);
532 if (pid < 0)
533 return -pid;
534 wait_event(vfsp->vfs_wait_sync_task, vfsp->vfs_sync_task);
535 return 0;
536}
537
538STATIC void
539linvfs_stop_syncd(
540 vfs_t *vfsp)
541{
542 vfsp->vfs_flag |= VFS_UMOUNT;
543 wmb();
544
545 wake_up_process(vfsp->vfs_sync_task);
546 wait_event(vfsp->vfs_wait_sync_task, !vfsp->vfs_sync_task);
547}
548
549STATIC void
550linvfs_put_super(
551 struct super_block *sb)
552{
553 vfs_t *vfsp = LINVFS_GET_VFS(sb);
554 int error;
555
556 linvfs_stop_syncd(vfsp);
557 VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error);
558 if (!error)
559 VFS_UNMOUNT(vfsp, 0, NULL, error);
560 if (error) {
561 printk("XFS unmount got error %d\n", error);
562 printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp);
563 return;
564 }
565
566 vfs_deallocate(vfsp);
567}
568
569STATIC void
570linvfs_write_super(
571 struct super_block *sb)
572{
573 vfs_t *vfsp = LINVFS_GET_VFS(sb);
574 int error;
575
576 if (sb->s_flags & MS_RDONLY) {
577 sb->s_dirt = 0; /* paranoia */
578 return;
579 }
580 /* Push the log and superblock a little */
581 VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error);
582 sb->s_dirt = 0;
583}
584
585STATIC int
586linvfs_sync_super(
587 struct super_block *sb,
588 int wait)
589{
590 vfs_t *vfsp = LINVFS_GET_VFS(sb);
591 int error;
592 int flags = SYNC_FSDATA;
593
594 if (wait)
595 flags |= SYNC_WAIT;
596
597 VFS_SYNC(vfsp, flags, NULL, error);
598 sb->s_dirt = 0;
599
600 if (unlikely(laptop_mode)) {
601 int prev_sync_seq = vfsp->vfs_sync_seq;
602
603 /*
604 * The disk must be active because we're syncing.
605 * We schedule xfssyncd now (now that the disk is
606 * active) instead of later (when it might not be).
607 */
608 wake_up_process(vfsp->vfs_sync_task);
609 /*
610 * We have to wait for the sync iteration to complete.
611 * If we don't, the disk activity caused by the sync
612 * will come after the sync is completed, and that
613 * triggers another sync from laptop mode.
614 */
615 wait_event(vfsp->vfs_wait_single_sync_task,
616 vfsp->vfs_sync_seq != prev_sync_seq);
617 }
618
619 return -error;
620}
621
622STATIC int
623linvfs_statfs(
624 struct super_block *sb,
625 struct kstatfs *statp)
626{
627 vfs_t *vfsp = LINVFS_GET_VFS(sb);
628 int error;
629
630 VFS_STATVFS(vfsp, statp, NULL, error);
631 return -error;
632}
633
634STATIC int
635linvfs_remount(
636 struct super_block *sb,
637 int *flags,
638 char *options)
639{
640 vfs_t *vfsp = LINVFS_GET_VFS(sb);
641 struct xfs_mount_args *args = xfs_args_allocate(sb);
642 int error;
643
644 VFS_PARSEARGS(vfsp, options, args, 1, error);
645 if (!error)
646 VFS_MNTUPDATE(vfsp, flags, args, error);
647 kmem_free(args, sizeof(*args));
648 return -error;
649}
650
651STATIC void
652linvfs_freeze_fs(
653 struct super_block *sb)
654{
655 VFS_FREEZE(LINVFS_GET_VFS(sb));
656}
657
658STATIC int
659linvfs_show_options(
660 struct seq_file *m,
661 struct vfsmount *mnt)
662{
663 struct vfs *vfsp = LINVFS_GET_VFS(mnt->mnt_sb);
664 int error;
665
666 VFS_SHOWARGS(vfsp, m, error);
667 return error;
668}
669
670STATIC int
671linvfs_getxstate(
672 struct super_block *sb,
673 struct fs_quota_stat *fqs)
674{
675 struct vfs *vfsp = LINVFS_GET_VFS(sb);
676 int error;
677
678 VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error);
679 return -error;
680}
681
682STATIC int
683linvfs_setxstate(
684 struct super_block *sb,
685 unsigned int flags,
686 int op)
687{
688 struct vfs *vfsp = LINVFS_GET_VFS(sb);
689 int error;
690
691 VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error);
692 return -error;
693}
694
695STATIC int
696linvfs_getxquota(
697 struct super_block *sb,
698 int type,
699 qid_t id,
700 struct fs_disk_quota *fdq)
701{
702 struct vfs *vfsp = LINVFS_GET_VFS(sb);
703 int error, getmode;
704
705 getmode = (type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETQUOTA;
706 VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error);
707 return -error;
708}
709
710STATIC int
711linvfs_setxquota(
712 struct super_block *sb,
713 int type,
714 qid_t id,
715 struct fs_disk_quota *fdq)
716{
717 struct vfs *vfsp = LINVFS_GET_VFS(sb);
718 int error, setmode;
719
720 setmode = (type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETQLIM;
721 VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error);
722 return -error;
723}
724
725STATIC int
726linvfs_fill_super(
727 struct super_block *sb,
728 void *data,
729 int silent)
730{
731 vnode_t *rootvp;
732 struct vfs *vfsp = vfs_allocate();
733 struct xfs_mount_args *args = xfs_args_allocate(sb);
734 struct kstatfs statvfs;
735 int error, error2;
736
737 vfsp->vfs_super = sb;
738 LINVFS_SET_VFS(sb, vfsp);
739 if (sb->s_flags & MS_RDONLY)
740 vfsp->vfs_flag |= VFS_RDONLY;
741 bhv_insert_all_vfsops(vfsp);
742
743 VFS_PARSEARGS(vfsp, (char *)data, args, 0, error);
744 if (error) {
745 bhv_remove_all_vfsops(vfsp, 1);
746 goto fail_vfsop;
747 }
748
749 sb_min_blocksize(sb, BBSIZE);
750#ifdef CONFIG_XFS_EXPORT
751 sb->s_export_op = &linvfs_export_ops;
752#endif
753 sb->s_qcop = &linvfs_qops;
754 sb->s_op = &linvfs_sops;
755
756 VFS_MOUNT(vfsp, args, NULL, error);
757 if (error) {
758 bhv_remove_all_vfsops(vfsp, 1);
759 goto fail_vfsop;
760 }
761
762 VFS_STATVFS(vfsp, &statvfs, NULL, error);
763 if (error)
764 goto fail_unmount;
765
766 sb->s_dirt = 1;
767 sb->s_magic = statvfs.f_type;
768 sb->s_blocksize = statvfs.f_bsize;
769 sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1;
770 sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
771 sb->s_time_gran = 1;
772 set_posix_acl_flag(sb);
773
774 VFS_ROOT(vfsp, &rootvp, error);
775 if (error)
776 goto fail_unmount;
777
778 sb->s_root = d_alloc_root(LINVFS_GET_IP(rootvp));
779 if (!sb->s_root) {
780 error = ENOMEM;
781 goto fail_vnrele;
782 }
783 if (is_bad_inode(sb->s_root->d_inode)) {
784 error = EINVAL;
785 goto fail_vnrele;
786 }
787 if ((error = linvfs_start_syncd(vfsp)))
788 goto fail_vnrele;
789 vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address);
790
791 kmem_free(args, sizeof(*args));
792 return 0;
793
794fail_vnrele:
795 if (sb->s_root) {
796 dput(sb->s_root);
797 sb->s_root = NULL;
798 } else {
799 VN_RELE(rootvp);
800 }
801
802fail_unmount:
803 VFS_UNMOUNT(vfsp, 0, NULL, error2);
804
805fail_vfsop:
806 vfs_deallocate(vfsp);
807 kmem_free(args, sizeof(*args));
808 return -error;
809}
810
811STATIC struct super_block *
812linvfs_get_sb(
813 struct file_system_type *fs_type,
814 int flags,
815 const char *dev_name,
816 void *data)
817{
818 return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super);
819}
820
821STATIC struct super_operations linvfs_sops = {
822 .alloc_inode = linvfs_alloc_inode,
823 .destroy_inode = linvfs_destroy_inode,
824 .write_inode = linvfs_write_inode,
825 .clear_inode = linvfs_clear_inode,
826 .put_super = linvfs_put_super,
827 .write_super = linvfs_write_super,
828 .sync_fs = linvfs_sync_super,
829 .write_super_lockfs = linvfs_freeze_fs,
830 .statfs = linvfs_statfs,
831 .remount_fs = linvfs_remount,
832 .show_options = linvfs_show_options,
833};
834
835STATIC struct quotactl_ops linvfs_qops = {
836 .get_xstate = linvfs_getxstate,
837 .set_xstate = linvfs_setxstate,
838 .get_xquota = linvfs_getxquota,
839 .set_xquota = linvfs_setxquota,
840};
841
842STATIC struct file_system_type xfs_fs_type = {
843 .owner = THIS_MODULE,
844 .name = "xfs",
845 .get_sb = linvfs_get_sb,
846 .kill_sb = kill_block_super,
847 .fs_flags = FS_REQUIRES_DEV,
848};
849
850
851STATIC int __init
852init_xfs_fs( void )
853{
854 int error;
855 struct sysinfo si;
856 static char message[] __initdata = KERN_INFO \
857 XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
858
859 printk(message);
860
861 si_meminfo(&si);
862 xfs_physmem = si.totalram;
863
864 ktrace_init(64);
865
866 error = init_inodecache();
867 if (error < 0)
868 goto undo_inodecache;
869
870 error = pagebuf_init();
871 if (error < 0)
872 goto undo_pagebuf;
873
874 vn_init();
875 xfs_init();
876 uuid_init();
877 vfs_initquota();
878
879 error = register_filesystem(&xfs_fs_type);
880 if (error)
881 goto undo_register;
882 XFS_DM_INIT(&xfs_fs_type);
883 return 0;
884
885undo_register:
886 pagebuf_terminate();
887
888undo_pagebuf:
889 destroy_inodecache();
890
891undo_inodecache:
892 return error;
893}
894
895STATIC void __exit
896exit_xfs_fs( void )
897{
898 vfs_exitquota();
899 XFS_DM_EXIT(&xfs_fs_type);
900 unregister_filesystem(&xfs_fs_type);
901 xfs_cleanup();
902 pagebuf_terminate();
903 destroy_inodecache();
904 ktrace_uninit();
905}
906
907module_init(init_xfs_fs);
908module_exit(exit_xfs_fs);
909
910MODULE_AUTHOR("Silicon Graphics, Inc.");
911MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
912MODULE_LICENSE("GPL");
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
new file mode 100644
index 000000000000..ec7e0035c731
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -0,0 +1,138 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPER_H__
33#define __XFS_SUPER_H__
34
35#ifdef CONFIG_XFS_DMAPI
36# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
37# define vfs_initdmapi() dmapi_init()
38# define vfs_exitdmapi() dmapi_uninit()
39#else
40# define vfs_insertdmapi(vfs) do { } while (0)
41# define vfs_initdmapi() do { } while (0)
42# define vfs_exitdmapi() do { } while (0)
43#endif
44
45#ifdef CONFIG_XFS_QUOTA
46# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops)
47extern void xfs_qm_init(void);
48extern void xfs_qm_exit(void);
49# define vfs_initquota() xfs_qm_init()
50# define vfs_exitquota() xfs_qm_exit()
51#else
52# define vfs_insertquota(vfs) do { } while (0)
53# define vfs_initquota() do { } while (0)
54# define vfs_exitquota() do { } while (0)
55#endif
56
57#ifdef CONFIG_XFS_POSIX_ACL
58# define XFS_ACL_STRING "ACLs, "
59# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL)
60#else
61# define XFS_ACL_STRING
62# define set_posix_acl_flag(sb) do { } while (0)
63#endif
64
65#ifdef CONFIG_XFS_SECURITY
66# define XFS_SECURITY_STRING "security attributes, "
67# define ENOSECURITY 0
68#else
69# define XFS_SECURITY_STRING
70# define ENOSECURITY EOPNOTSUPP
71#endif
72
73#ifdef CONFIG_XFS_RT
74# define XFS_REALTIME_STRING "realtime, "
75#else
76# define XFS_REALTIME_STRING
77#endif
78
79#if XFS_BIG_BLKNOS
80# if XFS_BIG_INUMS
81# define XFS_BIGFS_STRING "large block/inode numbers, "
82# else
83# define XFS_BIGFS_STRING "large block numbers, "
84# endif
85#else
86# define XFS_BIGFS_STRING
87#endif
88
89#ifdef CONFIG_XFS_TRACE
90# define XFS_TRACE_STRING "tracing, "
91#else
92# define XFS_TRACE_STRING
93#endif
94
95#ifdef CONFIG_XFS_DMAPI
96# define XFS_DMAPI_STRING "dmapi support, "
97#else
98# define XFS_DMAPI_STRING
99#endif
100
101#ifdef DEBUG
102# define XFS_DBG_STRING "debug"
103#else
104# define XFS_DBG_STRING "no debug"
105#endif
106
107#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
108 XFS_SECURITY_STRING \
109 XFS_REALTIME_STRING \
110 XFS_BIGFS_STRING \
111 XFS_TRACE_STRING \
112 XFS_DMAPI_STRING \
113 XFS_DBG_STRING /* DBG must be last */
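
Each option macro contributes a comma-terminated fragment (or nothing), and C's adjacent-string-literal concatenation glues the survivors into one banner string. For example, with only ACLs and DEBUG configured, XFS_BUILD_OPTIONS reduces to "ACLs, debug". A standalone demonstration of the mechanism:

#include <stdio.h>

#define ACL_STRING	"ACLs, "
#define RT_STRING	/* empty when the feature is compiled out */
#define DBG_STRING	"debug"
#define BUILD_OPTIONS	ACL_STRING RT_STRING DBG_STRING

int main(void)
{
	printf("%s\n", BUILD_OPTIONS);	/* prints: ACLs, debug */
	return 0;
}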
114
115#define LINVFS_GET_VFS(s) \
116 (vfs_t *)((s)->s_fs_info)
117#define LINVFS_SET_VFS(s, vfsp) \
118 ((s)->s_fs_info = vfsp)
119
120struct xfs_inode;
121struct xfs_mount;
122struct xfs_buftarg;
123struct block_device;
124
125extern __uint64_t xfs_max_file_offset(unsigned int);
126
127extern void xfs_initialize_vnode(bhv_desc_t *, vnode_t *, bhv_desc_t *, int);
128
129extern void xfs_flush_inode(struct xfs_inode *);
130extern void xfs_flush_device(struct xfs_inode *);
131
132extern int xfs_blkdev_get(struct xfs_mount *, const char *,
133 struct block_device **);
134extern void xfs_blkdev_put(struct block_device *);
135
136extern struct export_operations linvfs_export_ops;
137
138#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
new file mode 100644
index 000000000000..0dc010356f4d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -0,0 +1,174 @@
1/*
2 * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_rw.h"
35#include <linux/sysctl.h>
36#include <linux/proc_fs.h>
37
38
39static struct ctl_table_header *xfs_table_header;
40
41
42#ifdef CONFIG_PROC_FS
43STATIC int
44xfs_stats_clear_proc_handler(
45 ctl_table *ctl,
46 int write,
47 struct file *filp,
48 void __user *buffer,
49 size_t *lenp,
50 loff_t *ppos)
51{
52 int c, ret, *valp = ctl->data;
53 __uint32_t vn_active;
54
55 ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
56
57 if (!ret && write && *valp) {
 58		printk("XFS: Clearing xfsstats\n");
59 for (c = 0; c < NR_CPUS; c++) {
60 if (!cpu_possible(c)) continue;
61 preempt_disable();
62 /* save vn_active, it's a universal truth! */
63 vn_active = per_cpu(xfsstats, c).vn_active;
64 memset(&per_cpu(xfsstats, c), 0,
65 sizeof(struct xfsstats));
66 per_cpu(xfsstats, c).vn_active = vn_active;
67 preempt_enable();
68 }
69 xfs_stats_clear = 0;
70 }
71
72 return ret;
73}
74#endif /* CONFIG_PROC_FS */
75
76STATIC ctl_table xfs_table[] = {
77 {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val,
78 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
79 &sysctl_intvec, NULL,
80 &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max},
81
82 {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val,
83 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
84 &sysctl_intvec, NULL,
85 &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max},
86
87 {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val,
88 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
89 &sysctl_intvec, NULL,
90 &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max},
91
92 {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val,
93 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
94 &sysctl_intvec, NULL,
95 &xfs_params.panic_mask.min, &xfs_params.panic_mask.max},
96
97 {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val,
98 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
99 &sysctl_intvec, NULL,
100 &xfs_params.error_level.min, &xfs_params.error_level.max},
101
102 {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val,
103 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
104 &sysctl_intvec, NULL,
105 &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max},
106
107 {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val,
108 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
109 &sysctl_intvec, NULL,
110 &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max},
111
112 {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val,
113 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
114 &sysctl_intvec, NULL,
115 &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max},
116
117 {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val,
118 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
119 &sysctl_intvec, NULL,
120 &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max},
121
122 {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val,
123 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
124 &sysctl_intvec, NULL,
125 &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max},
126
127 {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val,
128 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
129 &sysctl_intvec, NULL,
130 &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max},
131
132 {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val,
133 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
134 &sysctl_intvec, NULL,
135 &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max},
136
137 {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val,
138 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
139 &sysctl_intvec, NULL,
140 &xfs_params.rotorstep.min, &xfs_params.rotorstep.max},
141
142 /* please keep this the last entry */
143#ifdef CONFIG_PROC_FS
144 {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val,
145 sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler,
146 &sysctl_intvec, NULL,
147 &xfs_params.stats_clear.min, &xfs_params.stats_clear.max},
148#endif /* CONFIG_PROC_FS */
149
150 {0}
151};
152
153STATIC ctl_table xfs_dir_table[] = {
154 {FS_XFS, "xfs", NULL, 0, 0555, xfs_table},
155 {0}
156};
157
158STATIC ctl_table xfs_root_table[] = {
159 {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table},
160 {0}
161};
162
163void
164xfs_sysctl_register(void)
165{
166 xfs_table_header = register_sysctl_table(xfs_root_table, 1);
167}
168
169void
170xfs_sysctl_unregister(void)
171{
172 if (xfs_table_header)
173 unregister_sysctl_table(xfs_table_header);
174}
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
new file mode 100644
index 000000000000..a39a95020a58
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -0,0 +1,114 @@
1/*
2 * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#ifndef __XFS_SYSCTL_H__
34#define __XFS_SYSCTL_H__
35
36#include <linux/sysctl.h>
37
38/*
39 * Tunable xfs parameters
40 */
41
42typedef struct xfs_sysctl_val {
43 int min;
44 int val;
45 int max;
46} xfs_sysctl_val_t;
47
48typedef struct xfs_param {
49 xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
50 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
51 * not a member of parent dir GID. */
52 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
53 xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
54 xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
55 xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */
56 xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */
57 xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */
58 xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
59 xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
60 xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
61 xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */
62 xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
63 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
64} xfs_param_t;
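
Each tunable is a {min, val, max} triple; the sysctl table in xfs_sysctl.c points proc_dointvec_minmax at the val member with the neighboring min/max as bounds, so out-of-range writes are refused and val never leaves its range. A userspace sketch of that contract (the refusal semantics are paraphrased, not lifted from the kernel handler):

#include <stdio.h>
#include <stdbool.h>

typedef struct { int min, val, max; } sysctl_val;

/* Same contract as wiring proc_dointvec_minmax at &t->val with
 * &t->min / &t->max as bounds: out-of-range writes are refused. */
static bool sysctl_write(sysctl_val *t, int newval)
{
	if (newval < t->min || newval > t->max)
		return false;
	t->val = newval;
	return true;
}

int main(void)
{
	sysctl_val error_level = { 0, 3, 11 };

	printf("write 5:  %s (val=%d)\n",
	       sysctl_write(&error_level, 5) ? "ok" : "refused",
	       error_level.val);
	printf("write 99: %s (val=%d)\n",
	       sysctl_write(&error_level, 99) ? "ok" : "refused",
	       error_level.val);
	return 0;
}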
65
66/*
67 * xfs_error_level:
68 *
69 * How much error reporting will be done when internal problems are
70 * encountered. These problems normally return an EFSCORRUPTED to their
71 * caller, with no other information reported.
72 *
73 * 0 No error reports
74 * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown
75 * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any
76 * additional errors that are known to not cause shutdowns)
77 *
78 * xfs_panic_mask bit 0x8 turns the error reports into panics
79 */
80
81enum {
82 /* XFS_REFCACHE_SIZE = 1 */
83 /* XFS_REFCACHE_PURGE = 2 */
84 XFS_RESTRICT_CHOWN = 3,
85 XFS_SGID_INHERIT = 4,
86 XFS_SYMLINK_MODE = 5,
87 XFS_PANIC_MASK = 6,
88 XFS_ERRLEVEL = 7,
89 XFS_SYNCD_TIMER = 8,
90 /* XFS_PROBE_DMAPI = 9 */
91 /* XFS_PROBE_IOOPS = 10 */
92 /* XFS_PROBE_QUOTA = 11 */
93 XFS_STATS_CLEAR = 12,
94 XFS_INHERIT_SYNC = 13,
95 XFS_INHERIT_NODUMP = 14,
96 XFS_INHERIT_NOATIME = 15,
97 XFS_BUF_TIMER = 16,
98 XFS_BUF_AGE = 17,
99 /* XFS_IO_BYPASS = 18 */
100 XFS_INHERIT_NOSYM = 19,
101 XFS_ROTORSTEP = 20,
102};
103
104extern xfs_param_t xfs_params;
105
106#ifdef CONFIG_SYSCTL
107extern void xfs_sysctl_register(void);
108extern void xfs_sysctl_unregister(void);
109#else
110# define xfs_sysctl_register() do { } while (0)
111# define xfs_sysctl_unregister() do { } while (0)
112#endif /* CONFIG_SYSCTL */
113
114#endif /* __XFS_SYSCTL_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
new file mode 100644
index 000000000000..96f96394417e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_version.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Dummy file that can contain a timestamp to put into the
35 * XFS init string, to help users keep track of what they're
36 * running
37 */
38
39#ifndef __XFS_VERSION_H__
40#define __XFS_VERSION_H__
41
42#define XFS_VERSION_STRING "SGI XFS"
43
44#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
new file mode 100644
index 000000000000..669c61644959
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -0,0 +1,330 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_macros.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_clnt.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_imap.h"
45#include "xfs_alloc.h"
46#include "xfs_dmapi.h"
47#include "xfs_mount.h"
48#include "xfs_quota.h"
49
50int
51vfs_mount(
52 struct bhv_desc *bdp,
53 struct xfs_mount_args *args,
54 struct cred *cr)
55{
56 struct bhv_desc *next = bdp;
57
58 ASSERT(next);
59 while (! (bhvtovfsops(next))->vfs_mount)
60 next = BHV_NEXT(next);
61 return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr));
62}
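
Every wrapper in this file is the same behavior-chain walk: follow BHV_NEXT past descriptors whose ops table leaves the slot NULL and dispatch at the first one that implements it, which is what lets the quota and DMAPI layers stack above the core vfsops. A self-contained mock of the dispatch (all names invented):

#include <stdio.h>
#include <stddef.h>

struct ops { int (*mount)(void); };
struct bhv { const struct ops *ops; struct bhv *next; };

static int core_mount(void) { puts("core mount"); return 0; }

/* The same walk as vfs_mount() above: skip descriptors whose ops
 * table leaves the slot NULL, dispatch at the first that fills it. */
static int chain_mount(struct bhv *bdp)
{
	while (!bdp->ops->mount)
		bdp = bdp->next;
	return bdp->ops->mount();
}

int main(void)
{
	static const struct ops quota_ops = { NULL };	  /* doesn't implement mount */
	static const struct ops core_ops  = { core_mount };
	struct bhv core  = { &core_ops,  NULL };
	struct bhv quota = { &quota_ops, &core };	  /* stacked on top */

	return chain_mount(&quota);
}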
63
64int
65vfs_parseargs(
66 struct bhv_desc *bdp,
67 char *s,
68 struct xfs_mount_args *args,
69 int f)
70{
71 struct bhv_desc *next = bdp;
72
73 ASSERT(next);
74 while (! (bhvtovfsops(next))->vfs_parseargs)
75 next = BHV_NEXT(next);
76 return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f));
77}
78
79int
80vfs_showargs(
81 struct bhv_desc *bdp,
82 struct seq_file *m)
83{
84 struct bhv_desc *next = bdp;
85
86 ASSERT(next);
87 while (! (bhvtovfsops(next))->vfs_showargs)
88 next = BHV_NEXT(next);
89 return ((*bhvtovfsops(next)->vfs_showargs)(next, m));
90}
91
92int
93vfs_unmount(
94 struct bhv_desc *bdp,
95 int fl,
96 struct cred *cr)
97{
98 struct bhv_desc *next = bdp;
99
100 ASSERT(next);
101 while (! (bhvtovfsops(next))->vfs_unmount)
102 next = BHV_NEXT(next);
103 return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr));
104}
105
106int
107vfs_mntupdate(
108 struct bhv_desc *bdp,
109 int *fl,
110 struct xfs_mount_args *args)
111{
112 struct bhv_desc *next = bdp;
113
114 ASSERT(next);
115 while (! (bhvtovfsops(next))->vfs_mntupdate)
116 next = BHV_NEXT(next);
117 return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args));
118}
119
120int
121vfs_root(
122 struct bhv_desc *bdp,
123 struct vnode **vpp)
124{
125 struct bhv_desc *next = bdp;
126
127 ASSERT(next);
128 while (! (bhvtovfsops(next))->vfs_root)
129 next = BHV_NEXT(next);
130 return ((*bhvtovfsops(next)->vfs_root)(next, vpp));
131}
132
133int
134vfs_statvfs(
135 struct bhv_desc *bdp,
136 xfs_statfs_t *sp,
137 struct vnode *vp)
138{
139 struct bhv_desc *next = bdp;
140
141 ASSERT(next);
142 while (! (bhvtovfsops(next))->vfs_statvfs)
143 next = BHV_NEXT(next);
144 return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp));
145}
146
147int
148vfs_sync(
149 struct bhv_desc *bdp,
150 int fl,
151 struct cred *cr)
152{
153 struct bhv_desc *next = bdp;
154
155 ASSERT(next);
156 while (! (bhvtovfsops(next))->vfs_sync)
157 next = BHV_NEXT(next);
158 return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr));
159}
160
161int
162vfs_vget(
163 struct bhv_desc *bdp,
164 struct vnode **vpp,
165 struct fid *fidp)
166{
167 struct bhv_desc *next = bdp;
168
169 ASSERT(next);
170 while (! (bhvtovfsops(next))->vfs_vget)
171 next = BHV_NEXT(next);
172 return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp));
173}
174
175int
176vfs_dmapiops(
177 struct bhv_desc *bdp,
178 caddr_t addr)
179{
180 struct bhv_desc *next = bdp;
181
182 ASSERT(next);
183 while (! (bhvtovfsops(next))->vfs_dmapiops)
184 next = BHV_NEXT(next);
185 return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr));
186}
187
188int
189vfs_quotactl(
190 struct bhv_desc *bdp,
191 int cmd,
192 int id,
193 caddr_t addr)
194{
195 struct bhv_desc *next = bdp;
196
197 ASSERT(next);
198 while (! (bhvtovfsops(next))->vfs_quotactl)
199 next = BHV_NEXT(next);
200 return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr));
201}
202
203void
204vfs_init_vnode(
205 struct bhv_desc *bdp,
206 struct vnode *vp,
207 struct bhv_desc *bp,
208 int unlock)
209{
210 struct bhv_desc *next = bdp;
211
212 ASSERT(next);
213 while (! (bhvtovfsops(next))->vfs_init_vnode)
214 next = BHV_NEXT(next);
215 ((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock));
216}
217
218void
219vfs_force_shutdown(
220 struct bhv_desc *bdp,
221 int fl,
222 char *file,
223 int line)
224{
225 struct bhv_desc *next = bdp;
226
227 ASSERT(next);
228 while (! (bhvtovfsops(next))->vfs_force_shutdown)
229 next = BHV_NEXT(next);
230 ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line));
231}
232
233void
234vfs_freeze(
235 struct bhv_desc *bdp)
236{
237 struct bhv_desc *next = bdp;
238
239 ASSERT(next);
240 while (! (bhvtovfsops(next))->vfs_freeze)
241 next = BHV_NEXT(next);
242 ((*bhvtovfsops(next)->vfs_freeze)(next));
243}
244
245vfs_t *
246vfs_allocate(void)
247{
248 struct vfs *vfsp;
249
250 vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP);
251 bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
252 INIT_LIST_HEAD(&vfsp->vfs_sync_list);
253 spin_lock_init(&vfsp->vfs_sync_lock);
254 init_waitqueue_head(&vfsp->vfs_wait_sync_task);
255 init_waitqueue_head(&vfsp->vfs_wait_single_sync_task);
256 return vfsp;
257}
258
259void
260vfs_deallocate(
261 struct vfs *vfsp)
262{
263 bhv_head_destroy(VFS_BHVHEAD(vfsp));
264 kmem_free(vfsp, sizeof(vfs_t));
265}
266
267void
268vfs_insertops(
269 struct vfs *vfsp,
270 struct bhv_vfsops *vfsops)
271{
272 struct bhv_desc *bdp;
273
274 bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP);
275 bhv_desc_init(bdp, NULL, vfsp, vfsops);
276 bhv_insert(&vfsp->vfs_bh, bdp);
277}
278
279void
280vfs_insertbhv(
281 struct vfs *vfsp,
282 struct bhv_desc *bdp,
283 struct vfsops *vfsops,
284 void *mount)
285{
286 bhv_desc_init(bdp, mount, vfsp, vfsops);
287 bhv_insert_initial(&vfsp->vfs_bh, bdp);
288}
289
290void
291bhv_remove_vfsops(
292 struct vfs *vfsp,
293 int pos)
294{
295 struct bhv_desc *bhv;
296
297 bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos);
298 if (!bhv)
299 return;
300 bhv_remove(&vfsp->vfs_bh, bhv);
301 kmem_free(bhv, sizeof(*bhv));
302}
303
304void
305bhv_remove_all_vfsops(
306 struct vfs *vfsp,
307 int freebase)
308{
309 struct xfs_mount *mp;
310
311 bhv_remove_vfsops(vfsp, VFS_POSITION_QM);
312 bhv_remove_vfsops(vfsp, VFS_POSITION_DM);
313 if (!freebase)
314 return;
315 mp = XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops));
316 VFS_REMOVEBHV(vfsp, &mp->m_bhv);
317 xfs_mount_free(mp, 0);
318}
319
320void
321bhv_insert_all_vfsops(
322 struct vfs *vfsp)
323{
324 struct xfs_mount *mp;
325
326 mp = xfs_mount_init();
327 vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp);
328 vfs_insertdmapi(vfsp);
329 vfs_insertquota(vfsp);
330}
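
Every vfs_* wrapper above follows one dispatch idiom: start at the first behavior descriptor, walk BHV_NEXT until a descriptor whose ops table implements the requested operation is found, then call through it. This is what lets the quota and DMAPI modules stack above the core filesystem and intercept only the operations they provide. A minimal, self-contained sketch of the idiom, using simplified stand-in types (toy_bhv and toy_ops are hypothetical, not the kernel's bhv_desc_t/vfsops_t):

	#include <assert.h>
	#include <stddef.h>
	#include <stdio.h>

	/* Hypothetical stand-ins for bhv_desc_t and vfsops_t. */
	struct toy_ops { int (*op_mount)(void); };
	struct toy_bhv {
		struct toy_bhv	*bd_next;	/* next behavior on the chain */
		struct toy_ops	*bd_ops;	/* this behavior's operations */
	};

	/* Same shape as vfs_mount() above: skip behaviors that do not
	 * implement the op, then call the first one that does. */
	static int toy_mount(struct toy_bhv *bdp)
	{
		struct toy_bhv *next = bdp;

		assert(next != NULL);
		while (next->bd_ops->op_mount == NULL)
			next = next->bd_next;
		return next->bd_ops->op_mount();
	}

	static int base_mount(void) { puts("mount handled by base fs"); return 0; }

	int main(void)
	{
		struct toy_ops qm_ops   = { NULL };		/* stacked layer: no mount op */
		struct toy_ops base_ops = { base_mount };	/* base fs implements it */
		struct toy_bhv base_bhv = { NULL, &base_ops };
		struct toy_bhv qm_bhv   = { &base_bhv, &qm_ops };

		return toy_mount(&qm_bhv);	/* falls through to the base */
	}

Note that, as in the wrappers above, the walk carries no NULL check past the head: the bottom behavior on the chain is assumed to implement every operation.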
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
new file mode 100644
index 000000000000..76493991578f
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -0,0 +1,223 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_VFS_H__
33#define __XFS_VFS_H__
34
35#include <linux/vfs.h>
36#include "xfs_fs.h"
37
38struct fid;
39struct vfs;
40struct cred;
41struct vnode;
42struct kstatfs;
43struct seq_file;
44struct super_block;
45struct xfs_mount_args;
46
47typedef struct kstatfs xfs_statfs_t;
48
49typedef struct vfs_sync_work {
50 struct list_head w_list;
51 struct vfs *w_vfs;
52 void *w_data; /* syncer routine argument */
53 void (*w_syncer)(struct vfs *, void *);
54} vfs_sync_work_t;
55
56typedef struct vfs {
57 u_int vfs_flag; /* flags */
58 xfs_fsid_t vfs_fsid; /* file system ID */
59 xfs_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */
60 bhv_head_t vfs_bh; /* head of vfs behavior chain */
61 struct super_block *vfs_super; /* generic superblock pointer */
62 struct task_struct *vfs_sync_task; /* generalised sync thread */
63 vfs_sync_work_t vfs_sync_work; /* work item for VFS_SYNC */
64 struct list_head vfs_sync_list; /* sync thread work item list */
65 spinlock_t vfs_sync_lock; /* work item list lock */
66 int vfs_sync_seq; /* sync thread generation no. */
67 wait_queue_head_t vfs_wait_single_sync_task;
68 wait_queue_head_t vfs_wait_sync_task;
69} vfs_t;
70
71#define vfs_fbhv vfs_bh.bh_first /* 1st on vfs behavior chain */
72
73#define bhvtovfs(bdp) ( (struct vfs *)BHV_VOBJ(bdp) )
74#define bhvtovfsops(bdp) ( (struct vfsops *)BHV_OPS(bdp) )
75#define VFS_BHVHEAD(vfs) ( &(vfs)->vfs_bh )
76#define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) )
77
78#define VFS_POSITION_BASE BHV_POSITION_BASE /* chain bottom */
79#define VFS_POSITION_TOP BHV_POSITION_TOP /* chain top */
80#define VFS_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */
81
82typedef enum {
83 VFS_BHV_UNKNOWN, /* not specified */
84 VFS_BHV_XFS, /* xfs */
85 VFS_BHV_DM, /* data migration */
86 VFS_BHV_QM, /* quota manager */
87 VFS_BHV_IO, /* IO path */
88 VFS_BHV_END /* housekeeping end-of-range */
89} vfs_bhv_t;
90
91#define VFS_POSITION_XFS (BHV_POSITION_BASE)
92#define VFS_POSITION_DM (VFS_POSITION_BASE+10)
93#define VFS_POSITION_QM (VFS_POSITION_BASE+20)
94#define VFS_POSITION_IO (VFS_POSITION_BASE+30)
95
96#define VFS_RDONLY 0x0001 /* read-only vfs */
97#define VFS_GRPID 0x0002 /* group-ID assigned from directory */
98#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */
99#define VFS_UMOUNT 0x0008 /* unmount in progress */
100#define VFS_END 0x0008 /* max flag */
101
102#define SYNC_ATTR 0x0001 /* sync attributes */
103#define SYNC_CLOSE 0x0002 /* close file system down */
104#define SYNC_DELWRI 0x0004 /* look at delayed writes */
105#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
106#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
107#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
108#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
109#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
110
111typedef int (*vfs_mount_t)(bhv_desc_t *,
112 struct xfs_mount_args *, struct cred *);
113typedef int (*vfs_parseargs_t)(bhv_desc_t *, char *,
114 struct xfs_mount_args *, int);
115typedef int (*vfs_showargs_t)(bhv_desc_t *, struct seq_file *);
116typedef int (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
117typedef int (*vfs_mntupdate_t)(bhv_desc_t *, int *,
118 struct xfs_mount_args *);
119typedef int (*vfs_root_t)(bhv_desc_t *, struct vnode **);
120typedef int (*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
121typedef int (*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
122typedef int (*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *);
123typedef int (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t);
124typedef int (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t);
125typedef void (*vfs_init_vnode_t)(bhv_desc_t *,
126 struct vnode *, bhv_desc_t *, int);
127typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
128typedef void (*vfs_freeze_t)(bhv_desc_t *);
129
130typedef struct vfsops {
131 bhv_position_t vf_position; /* behavior chain position */
132 vfs_mount_t vfs_mount; /* mount file system */
133 vfs_parseargs_t vfs_parseargs; /* parse mount options */
134 vfs_showargs_t vfs_showargs; /* unparse mount options */
135 vfs_unmount_t vfs_unmount; /* unmount file system */
136 vfs_mntupdate_t vfs_mntupdate; /* update file system options */
137 vfs_root_t vfs_root; /* get root vnode */
138 vfs_statvfs_t vfs_statvfs; /* file system statistics */
139 vfs_sync_t vfs_sync; /* flush files */
140 vfs_vget_t vfs_vget; /* get vnode from fid */
141 vfs_dmapiops_t vfs_dmapiops; /* data migration */
142 vfs_quotactl_t vfs_quotactl; /* disk quota */
143 vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */
144 vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */
145 vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */
146} vfsops_t;
147
148/*
149 * VFS operations. Operate on vfs structure pointers (start at the bhv head).
150 */
151#define VHEAD(v) ((v)->vfs_fbhv)
152#define VFS_MOUNT(v, ma,cr, rv) ((rv) = vfs_mount(VHEAD(v), ma,cr))
153#define VFS_PARSEARGS(v, o,ma,f, rv) ((rv) = vfs_parseargs(VHEAD(v), o,ma,f))
154#define VFS_SHOWARGS(v, m, rv) ((rv) = vfs_showargs(VHEAD(v), m))
155#define VFS_UNMOUNT(v, f, cr, rv) ((rv) = vfs_unmount(VHEAD(v), f,cr))
156#define VFS_MNTUPDATE(v, fl, args, rv) ((rv) = vfs_mntupdate(VHEAD(v), fl, args))
157#define VFS_ROOT(v, vpp, rv) ((rv) = vfs_root(VHEAD(v), vpp))
158#define VFS_STATVFS(v, sp,vp, rv) ((rv) = vfs_statvfs(VHEAD(v), sp,vp))
159#define VFS_SYNC(v, flag,cr, rv) ((rv) = vfs_sync(VHEAD(v), flag,cr))
160#define VFS_VGET(v, vpp,fidp, rv) ((rv) = vfs_vget(VHEAD(v), vpp,fidp))
161#define VFS_DMAPIOPS(v, p, rv) ((rv) = vfs_dmapiops(VHEAD(v), p))
162#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p))
163#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) )
164#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) )
165#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) )
166
167/*
168 * PVFS operations. Operate on behavior descriptor pointers.
169 */
170#define PVFS_MOUNT(b, ma,cr, rv) ((rv) = vfs_mount(b, ma,cr))
171#define PVFS_PARSEARGS(b, o,ma,f, rv) ((rv) = vfs_parseargs(b, o,ma,f))
172#define PVFS_SHOWARGS(b, m, rv) ((rv) = vfs_showargs(b, m))
173#define PVFS_UNMOUNT(b, f,cr, rv) ((rv) = vfs_unmount(b, f,cr))
174#define PVFS_MNTUPDATE(b, fl, args, rv) ((rv) = vfs_mntupdate(b, fl, args))
175#define PVFS_ROOT(b, vpp, rv) ((rv) = vfs_root(b, vpp))
176#define PVFS_STATVFS(b, sp,vp, rv) ((rv) = vfs_statvfs(b, sp,vp))
177#define PVFS_SYNC(b, flag,cr, rv) ((rv) = vfs_sync(b, flag,cr))
178#define PVFS_VGET(b, vpp,fidp, rv) ((rv) = vfs_vget(b, vpp,fidp))
179#define PVFS_DMAPIOPS(b, p, rv) ((rv) = vfs_dmapiops(b, p))
180#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p))
181#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) )
182#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) )
183#define PVFS_FREEZE(b) ( vfs_freeze(b) )
184
185extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
186extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
187extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
188extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
189extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
190extern int vfs_root(bhv_desc_t *, struct vnode **);
191extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
192extern int vfs_sync(bhv_desc_t *, int, struct cred *);
193extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *);
194extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
195extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
196extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
197extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
198extern void vfs_freeze(bhv_desc_t *);
199
200typedef struct bhv_vfsops {
201 struct vfsops bhv_common;
202 void * bhv_custom;
203} bhv_vfsops_t;
204
205#define vfs_bhv_lookup(v, id) ( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) )
206#define vfs_bhv_custom(b) ( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom )
207#define vfs_bhv_set_custom(b,o) ( (b)->bhv_custom = (void *)(o))
208#define vfs_bhv_clr_custom(b) ( (b)->bhv_custom = NULL )
209
210extern vfs_t *vfs_allocate(void);
211extern void vfs_deallocate(vfs_t *);
212extern void vfs_insertops(vfs_t *, bhv_vfsops_t *);
213extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *);
214
215extern void bhv_insert_all_vfsops(struct vfs *);
216extern void bhv_remove_all_vfsops(struct vfs *, int);
217extern void bhv_remove_vfsops(struct vfs *, int);
218
219#define fs_frozen(vfsp) ((vfsp)->vfs_super->s_frozen)
220#define fs_check_frozen(vfsp, level) \
221 vfs_check_frozen(vfsp->vfs_super, level);
222
223#endif /* __XFS_VFS_H__ */
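
The VFS_* and PVFS_* macros above return status through a trailing rv argument rather than expanding to an expression, so int-returning and void operations (VFS_FREEZE, VFS_INIT_VNODE) keep the same statement shape at call sites. A self-contained toy of that trailing-rv convention (TOY_SYNC, TOY_FREEZE, and their helpers are made-up names, not kernel macros):

	#include <stdio.h>

	/* Hypothetical macros following the same trailing-rv convention
	 * as VFS_SYNC(v, flag, cr, rv) and VFS_FREEZE(v). */
	#define TOY_SYNC(obj, flags, rv)	((rv) = toy_sync((obj), (flags)))
	#define TOY_FREEZE(obj)			( toy_freeze(obj) )

	static int toy_sync(int obj, int flags)
	{
		printf("sync obj=%d flags=%#x\n", obj, flags);
		return 0;	/* 0 == success, as with the vfs_* calls */
	}

	static void toy_freeze(int obj)
	{
		printf("freeze obj=%d\n", obj);
	}

	int main(void)
	{
		int error;

		TOY_SYNC(42, 0x0008 /* cf. SYNC_WAIT */, error);
		if (error)
			return error;
		TOY_FREEZE(42);		/* void op, same statement shape */
		return 0;
	}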
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
new file mode 100644
index 000000000000..849c61c74f3c
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -0,0 +1,455 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35
36uint64_t vn_generation; /* vnode generation number */
37DEFINE_SPINLOCK(vnumber_lock);
38
39/*
40 * Dedicated vnode inactive/reclaim sync semaphores.
41 * Prime number of hash buckets since address is used as the key.
42 */
43#define NVSYNC 37
44#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
45sv_t vsync[NVSYNC];
46
47/*
48 * Translate stat(2) file types to vnode types and vice versa.
49 * Aware of numeric order of S_IFMT and vnode type values.
50 */
51enum vtype iftovt_tab[] = {
52 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
53 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
54};
55
56u_short vttoif_tab[] = {
57 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
58};
59
60
61void
62vn_init(void)
63{
64 register sv_t *svp;
65 register int i;
66
67 for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
68 init_sv(svp, SV_DEFAULT, "vsy", i);
69}
70
71/*
72 * Clean a vnode of filesystem-specific data and prepare it for reuse.
73 */
74STATIC int
75vn_reclaim(
76 struct vnode *vp)
77{
78 int error;
79
80 XFS_STATS_INC(vn_reclaim);
81 vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
82
83 /*
84 * Only make the VOP_RECLAIM call if there are behaviors
85 * to call.
86 */
87 if (vp->v_fbhv) {
88 VOP_RECLAIM(vp, error);
89 if (error)
90 return -error;
91 }
92 ASSERT(vp->v_fbhv == NULL);
93
94 VN_LOCK(vp);
95 vp->v_flag &= (VRECLM|VWAIT);
96 VN_UNLOCK(vp, 0);
97
98 vp->v_type = VNON;
99 vp->v_fbhv = NULL;
100
101#ifdef XFS_VNODE_TRACE
102 ktrace_free(vp->v_trace);
103 vp->v_trace = NULL;
104#endif
105
106 return 0;
107}
108
109STATIC void
110vn_wakeup(
111 struct vnode *vp)
112{
113 VN_LOCK(vp);
114 if (vp->v_flag & VWAIT)
115 sv_broadcast(vptosync(vp));
116 vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED);
117 VN_UNLOCK(vp, 0);
118}
119
120int
121vn_wait(
122 struct vnode *vp)
123{
124 VN_LOCK(vp);
125 if (vp->v_flag & (VINACT | VRECLM)) {
126 vp->v_flag |= VWAIT;
127 sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
128 return 1;
129 }
130 VN_UNLOCK(vp, 0);
131 return 0;
132}
133
134struct vnode *
135vn_initialize(
136 struct inode *inode)
137{
138 struct vnode *vp = LINVFS_GET_VP(inode);
139
140 XFS_STATS_INC(vn_active);
141 XFS_STATS_INC(vn_alloc);
142
143 vp->v_flag = VMODIFIED;
144 spinlock_init(&vp->v_lock, "v_lock");
145
146 spin_lock(&vnumber_lock);
147 if (!++vn_generation) /* v_number shouldn't be zero */
148 vn_generation++;
149 vp->v_number = vn_generation;
150 spin_unlock(&vnumber_lock);
151
152 ASSERT(VN_CACHED(vp) == 0);
153
154 /* Initialize the first behavior and the behavior chain head. */
155 vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
156
157#ifdef XFS_VNODE_TRACE
158 vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
159 printk("Allocated VNODE_TRACE at 0x%p\n", vp->v_trace);
160#endif /* XFS_VNODE_TRACE */
161
162 vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
163 return vp;
164}
165
166/*
167 * Get a reference on a vnode.
168 */
169vnode_t *
170vn_get(
171 struct vnode *vp,
172 vmap_t *vmap)
173{
174 struct inode *inode;
175
176 XFS_STATS_INC(vn_get);
177 inode = LINVFS_GET_IP(vp);
178 if (inode->i_state & I_FREEING)
179 return NULL;
180
181 inode = ilookup(vmap->v_vfsp->vfs_super, vmap->v_ino);
182 if (!inode) /* Inode not present */
183 return NULL;
184
185 vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
186
187 return vp;
188}
189
190/*
191 * Revalidate the Linux inode from the vattr.
192 * Note: i_size _not_ updated; we must hold the inode
193 * semaphore when doing that - callers responsibility.
194 */
195void
196vn_revalidate_core(
197 struct vnode *vp,
198 vattr_t *vap)
199{
200 struct inode *inode = LINVFS_GET_IP(vp);
201
202 inode->i_mode = VTTOIF(vap->va_type) | vap->va_mode;
203 inode->i_nlink = vap->va_nlink;
204 inode->i_uid = vap->va_uid;
205 inode->i_gid = vap->va_gid;
206 inode->i_blocks = vap->va_nblocks;
207 inode->i_mtime = vap->va_mtime;
208 inode->i_ctime = vap->va_ctime;
209 inode->i_atime = vap->va_atime;
210 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
211 inode->i_flags |= S_IMMUTABLE;
212 else
213 inode->i_flags &= ~S_IMMUTABLE;
214 if (vap->va_xflags & XFS_XFLAG_APPEND)
215 inode->i_flags |= S_APPEND;
216 else
217 inode->i_flags &= ~S_APPEND;
218 if (vap->va_xflags & XFS_XFLAG_SYNC)
219 inode->i_flags |= S_SYNC;
220 else
221 inode->i_flags &= ~S_SYNC;
222 if (vap->va_xflags & XFS_XFLAG_NOATIME)
223 inode->i_flags |= S_NOATIME;
224 else
225 inode->i_flags &= ~S_NOATIME;
226}
227
228/*
229 * Revalidate the Linux inode from the vnode.
230 */
231int
232vn_revalidate(
233 struct vnode *vp)
234{
235 vattr_t va;
236 int error;
237
238 vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address);
239 ASSERT(vp->v_fbhv != NULL);
240
241 va.va_mask = XFS_AT_STAT|XFS_AT_XFLAGS;
242 VOP_GETATTR(vp, &va, 0, NULL, error);
243 if (!error) {
244 vn_revalidate_core(vp, &va);
245 VUNMODIFY(vp);
246 }
247 return -error;
248}
249
250/*
251 * Purge a vnode from the cache.
252 * At this point the vnode is guaranteed to have no references (vn_count == 0).
253 * The caller has to make sure that there are no ways someone could
254 * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
255 */
256void
257vn_purge(
258 struct vnode *vp,
259 vmap_t *vmap)
260{
261 vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
262
263again:
264 /*
265 * Check whether vp has already been reclaimed since our caller
266 * sampled its version while holding a filesystem cache lock that
267 * its VOP_RECLAIM function acquires.
268 */
269 VN_LOCK(vp);
270 if (vp->v_number != vmap->v_number) {
271 VN_UNLOCK(vp, 0);
272 return;
273 }
274
275 /*
276 * If vp is being reclaimed or inactivated, wait until it is inert,
277 * then proceed. Can't assume that vnode is actually reclaimed
278 * just because the reclaimed flag is asserted -- a vn_alloc
279 * reclaim can fail.
280 */
281 if (vp->v_flag & (VINACT | VRECLM)) {
282 ASSERT(vn_count(vp) == 0);
283 vp->v_flag |= VWAIT;
284 sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
285 goto again;
286 }
287
288 /*
289 * Another process could have raced in and gotten this vnode...
290 */
291 if (vn_count(vp) > 0) {
292 VN_UNLOCK(vp, 0);
293 return;
294 }
295
296 XFS_STATS_DEC(vn_active);
297 vp->v_flag |= VRECLM;
298 VN_UNLOCK(vp, 0);
299
300 /*
301 * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells
302 * vp's filesystem to flush and invalidate all cached resources.
303 * When vn_reclaim returns, vp should have no private data,
304 * either in a system cache or attached to v_data.
305 */
306 if (vn_reclaim(vp) != 0)
307 panic("vn_purge: cannot reclaim");
308
309 /*
310 * Wakeup anyone waiting for vp to be reclaimed.
311 */
312 vn_wakeup(vp);
313}
314
315/*
316 * Add a reference to a referenced vnode.
317 */
318struct vnode *
319vn_hold(
320 struct vnode *vp)
321{
322 struct inode *inode;
323
324 XFS_STATS_INC(vn_hold);
325
326 VN_LOCK(vp);
327 inode = igrab(LINVFS_GET_IP(vp));
328 ASSERT(inode);
329 VN_UNLOCK(vp, 0);
330
331 return vp;
332}
333
334/*
335 * Call VOP_INACTIVE on last reference.
336 */
337void
338vn_rele(
339 struct vnode *vp)
340{
341 int vcnt;
342 int cache;
343
344 XFS_STATS_INC(vn_rele);
345
346 VN_LOCK(vp);
347
348 vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address);
349 vcnt = vn_count(vp);
350
351 /*
352 * Since we always get called from put_inode we know
353 * that i_count won't be decremented after we
354 * return.
355 */
356 if (!vcnt) {
357 /*
358 * As soon as we turn this on, no one can find us in vn_get
359 * until we turn off VINACT or VRECLM.
360 */
361 vp->v_flag |= VINACT;
362 VN_UNLOCK(vp, 0);
363
364 /*
365 * Do not make the VOP_INACTIVE call if there
366 * are no behaviors attached to the vnode to call.
367 */
368 if (vp->v_fbhv)
369 VOP_INACTIVE(vp, NULL, cache);
370
371 VN_LOCK(vp);
372 if (vp->v_flag & VWAIT)
373 sv_broadcast(vptosync(vp));
374
375 vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED);
376 }
377
378 VN_UNLOCK(vp, 0);
379
380 vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address);
381}
382
383/*
384 * Finish the removal of a vnode.
385 */
386void
387vn_remove(
388 struct vnode *vp)
389{
390 vmap_t vmap;
391
392 /* Make sure we don't do this to the same vnode twice */
393 if (!(vp->v_fbhv))
394 return;
395
396 XFS_STATS_INC(vn_remove);
397 vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address);
398
399 /*
400 * After the following purge the vnode
401 * will no longer exist.
402 */
403 VMAP(vp, vmap);
404 vn_purge(vp, &vmap);
405}
406
407
408#ifdef XFS_VNODE_TRACE
409
410#define KTRACE_ENTER(vp, vk, s, line, ra) \
411 ktrace_enter( (vp)->v_trace, \
412/* 0 */ (void *)(__psint_t)(vk), \
413/* 1 */ (void *)(s), \
414/* 2 */ (void *)(__psint_t) line, \
415/* 3 */ (void *)(vn_count(vp)), \
416/* 4 */ (void *)(ra), \
417/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \
418/* 6 */ (void *)(__psint_t)current_cpu(), \
419/* 7 */ (void *)(__psint_t)current_pid(), \
420/* 8 */ (void *)__return_address, \
421/* 9 */ 0, 0, 0, 0, 0, 0, 0)
422
423/*
424 * Vnode tracing code.
425 */
426void
427vn_trace_entry(vnode_t *vp, char *func, inst_t *ra)
428{
429 KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
430}
431
432void
433vn_trace_exit(vnode_t *vp, char *func, inst_t *ra)
434{
435 KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
436}
437
438void
439vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
440{
441 KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
442}
443
444void
445vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
446{
447 KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
448}
449
450void
451vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
452{
453 KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
454}
455#endif /* XFS_VNODE_TRACE */
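
The "prime number of hash buckets" remark near the top of this file is about stride: vnode addresses come from fixed-size, aligned allocations, so consecutive keys differ by a constant that shares factors with any power-of-two table size, folding them into a few buckets. A prime bucket count is coprime to such strides. A self-contained demonstration of the effect (the constants are illustrative; only NVSYNC == 37 comes from the code above):

	#include <stdio.h>

	#define NPRIME	37	/* cf. NVSYNC */
	#define NPOW2	32

	int main(void)
	{
		/* Simulate addresses of 64-byte-aligned objects, as vnodes
		 * handed out by a slab-style allocator might be. */
		int prime_hits[NPRIME] = { 0 };
		int pow2_hits[NPOW2] = { 0 };
		unsigned long addr;
		int i, pworst = 0, qworst = 0;

		for (addr = 0x10000; addr < 0x10000 + 64UL * 1000; addr += 64) {
			prime_hits[addr % NPRIME]++;
			pow2_hits[addr % NPOW2]++;
		}
		for (i = 0; i < NPRIME; i++)
			if (prime_hits[i] > pworst)
				pworst = prime_hits[i];
		for (i = 0; i < NPOW2; i++)
			if (pow2_hits[i] > qworst)
				qworst = pow2_hits[i];

		/* mod 37 spreads the 1000 keys ~27 per bucket; mod 32
		 * drops all of them into a single bucket. */
		printf("worst bucket, mod %d: %d\n", NPRIME, pworst);
		printf("worst bucket, mod %d: %d\n", NPOW2, qworst);
		return 0;
	}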
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
new file mode 100644
index 000000000000..da76c1f1e11c
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -0,0 +1,666 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 *
32 * Portions Copyright (c) 1989, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 */
59#ifndef __XFS_VNODE_H__
60#define __XFS_VNODE_H__
61
62struct uio;
63struct file;
64struct vattr;
65struct xfs_iomap;
66struct attrlist_cursor_kern;
67
68/*
69 * Vnode types. VNON means no type.
70 */
71enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK };
72
73typedef xfs_ino_t vnumber_t;
74typedef struct dentry vname_t;
75typedef bhv_head_t vn_bhv_head_t;
76
77/*
78 * MP locking protocols:
79 * v_flag, v_vfsp VN_LOCK/VN_UNLOCK
80 * v_type read-only or fs-dependent
81 */
82typedef struct vnode {
83 __u32 v_flag; /* vnode flags (see below) */
84 enum vtype v_type; /* vnode type */
85 struct vfs *v_vfsp; /* ptr to containing VFS */
86 vnumber_t v_number; /* in-core vnode number */
87 vn_bhv_head_t v_bh; /* behavior head */
88 spinlock_t v_lock; /* VN_LOCK/VN_UNLOCK */
89 struct inode v_inode; /* Linux inode */
90#ifdef XFS_VNODE_TRACE
91 struct ktrace *v_trace; /* trace header structure */
92#endif
93} vnode_t;
94
95#define v_fbhv v_bh.bh_first /* first behavior */
96#define v_fops v_bh.bh_first->bd_ops /* first behavior ops */
97
98#define VNODE_POSITION_BASE BHV_POSITION_BASE /* chain bottom */
99#define VNODE_POSITION_TOP BHV_POSITION_TOP /* chain top */
100#define VNODE_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */
101
102typedef enum {
103 VN_BHV_UNKNOWN, /* not specified */
104 VN_BHV_XFS, /* xfs */
105 VN_BHV_DM, /* data migration */
106 VN_BHV_QM, /* quota manager */
107 VN_BHV_IO, /* IO path */
108 VN_BHV_END /* housekeeping end-of-range */
109} vn_bhv_t;
110
111#define VNODE_POSITION_XFS (VNODE_POSITION_BASE)
112#define VNODE_POSITION_DM (VNODE_POSITION_BASE+10)
113#define VNODE_POSITION_QM (VNODE_POSITION_BASE+20)
114#define VNODE_POSITION_IO (VNODE_POSITION_BASE+30)
115
116/*
117 * Macros for dealing with the behavior descriptor inside of the vnode.
118 */
119#define BHV_TO_VNODE(bdp) ((vnode_t *)BHV_VOBJ(bdp))
120#define BHV_TO_VNODE_NULL(bdp) ((vnode_t *)BHV_VOBJNULL(bdp))
121
122#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh)))
123#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name)
124#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp)
125#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops)
126#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
127
128/*
129 * Vnode to Linux inode mapping.
130 */
131#define LINVFS_GET_VP(inode) ((vnode_t *)list_entry(inode, vnode_t, v_inode))
132#define LINVFS_GET_IP(vp) (&(vp)->v_inode)
133
134/*
135 * Convert between vnode types and inode formats (since POSIX.1
136 * defines mode word of stat structure in terms of inode formats).
137 */
138extern enum vtype iftovt_tab[];
139extern u_short vttoif_tab[];
140#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12])
141#define VTTOIF(indx) (vttoif_tab[(int)(indx)])
142#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode))
143
144
145/*
146 * Vnode flags.
147 */
148#define VINACT 0x1 /* vnode is being inactivated */
149#define VRECLM 0x2 /* vnode is being reclaimed */
150#define VWAIT 0x4 /* waiting for VINACT/VRECLM to end */
151#define VMODIFIED 0x8 /* XFS inode state possibly differs */
152						/* from the Linux inode state. */
153
154/*
155 * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
156 */
157typedef enum vrwlock {
158 VRWLOCK_NONE,
159 VRWLOCK_READ,
160 VRWLOCK_WRITE,
161 VRWLOCK_WRITE_DIRECT,
162 VRWLOCK_TRY_READ,
163 VRWLOCK_TRY_WRITE
164} vrwlock_t;
165
166/*
167 * Return values for VOP_INACTIVE. A return value of
168 * VN_INACTIVE_NOCACHE implies that the file system behavior
169 * has disassociated its state and bhv_desc_t from the vnode.
170 */
171#define VN_INACTIVE_CACHE 0
172#define VN_INACTIVE_NOCACHE 1
173
174/*
175 * Values for the cmd code given to VOP_VNODE_CHANGE.
176 */
177typedef enum vchange {
178 VCHANGE_FLAGS_FRLOCKS = 0,
179 VCHANGE_FLAGS_ENF_LOCKING = 1,
180 VCHANGE_FLAGS_TRUNCATED = 2,
181 VCHANGE_FLAGS_PAGE_DIRTY = 3,
182 VCHANGE_FLAGS_IOEXCL_COUNT = 4
183} vchange_t;
184
185
186typedef int (*vop_open_t)(bhv_desc_t *, struct cred *);
187typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
188 const struct iovec *, unsigned int,
189 loff_t *, int, struct cred *);
190typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
191 const struct iovec *, unsigned int,
192 loff_t *, int, struct cred *);
193typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
194 loff_t *, int, size_t, read_actor_t,
195 void *, struct cred *);
196typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
197 int, unsigned int, void __user *);
198typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
199 struct cred *);
200typedef int (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
201 struct cred *);
202typedef int (*vop_access_t)(bhv_desc_t *, int, struct cred *);
203typedef int (*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **,
204 int, vnode_t *, struct cred *);
205typedef int (*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
206 vnode_t **, struct cred *);
207typedef int (*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
208typedef int (*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *,
209 struct cred *);
210typedef int (*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *,
211 struct cred *);
212typedef int (*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
213 vnode_t **, struct cred *);
214typedef int (*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
215typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
216 int *);
217typedef int (*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
218 char *, vnode_t **, struct cred *);
219typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
220 struct cred *);
221typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
222 xfs_off_t, xfs_off_t);
223typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *);
224typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *);
225typedef int (*vop_release_t)(bhv_desc_t *);
226typedef int (*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
227typedef void (*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
228typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int,
229 struct xfs_iomap *, int *);
230typedef int (*vop_reclaim_t)(bhv_desc_t *);
231typedef int (*vop_attr_get_t)(bhv_desc_t *, char *, char *, int *, int,
232 struct cred *);
233typedef int (*vop_attr_set_t)(bhv_desc_t *, char *, char *, int, int,
234 struct cred *);
235typedef int (*vop_attr_remove_t)(bhv_desc_t *, char *, int, struct cred *);
236typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
237 struct attrlist_cursor_kern *, struct cred *);
238typedef void (*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
239typedef void (*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
240typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
241typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
242typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
243 uint64_t, int);
244typedef int (*vop_iflush_t)(bhv_desc_t *, int);
245
246
247typedef struct vnodeops {
248 bhv_position_t vn_position; /* position within behavior chain */
249 vop_open_t vop_open;
250 vop_read_t vop_read;
251 vop_write_t vop_write;
252 vop_sendfile_t vop_sendfile;
253 vop_ioctl_t vop_ioctl;
254 vop_getattr_t vop_getattr;
255 vop_setattr_t vop_setattr;
256 vop_access_t vop_access;
257 vop_lookup_t vop_lookup;
258 vop_create_t vop_create;
259 vop_remove_t vop_remove;
260 vop_link_t vop_link;
261 vop_rename_t vop_rename;
262 vop_mkdir_t vop_mkdir;
263 vop_rmdir_t vop_rmdir;
264 vop_readdir_t vop_readdir;
265 vop_symlink_t vop_symlink;
266 vop_readlink_t vop_readlink;
267 vop_fsync_t vop_fsync;
268 vop_inactive_t vop_inactive;
269 vop_fid2_t vop_fid2;
270 vop_rwlock_t vop_rwlock;
271 vop_rwunlock_t vop_rwunlock;
272 vop_bmap_t vop_bmap;
273 vop_reclaim_t vop_reclaim;
274 vop_attr_get_t vop_attr_get;
275 vop_attr_set_t vop_attr_set;
276 vop_attr_remove_t vop_attr_remove;
277 vop_attr_list_t vop_attr_list;
278 vop_link_removed_t vop_link_removed;
279 vop_vnode_change_t vop_vnode_change;
280 vop_ptossvp_t vop_tosspages;
281 vop_pflushinvalvp_t vop_flushinval_pages;
282 vop_pflushvp_t vop_flush_pages;
283 vop_release_t vop_release;
284 vop_iflush_t vop_iflush;
285} vnodeops_t;
286
287/*
288 * VOP's.
289 */
290#define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op)
291
292#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv) \
293 rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
294#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv) \
295 rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
296#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \
297 rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
298#define VOP_BMAP(vp,of,sz,rw,b,n,rv) \
299 rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
300#define VOP_OPEN(vp, cr, rv) \
301 rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
302#define VOP_GETATTR(vp, vap, f, cr, rv) \
303 rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
304#define VOP_SETATTR(vp, vap, f, cr, rv) \
305 rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr)
306#define VOP_ACCESS(vp, mode, cr, rv) \
307 rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr)
308#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv) \
309 rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr)
310#define VOP_CREATE(dvp,d,vap,vpp,cr,rv) \
311 rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr)
312#define VOP_REMOVE(dvp,d,cr,rv) \
313 rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr)
314#define VOP_LINK(tdvp,fvp,d,cr,rv) \
315 rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr)
316#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv) \
317 rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr)
318#define VOP_MKDIR(dp,d,vap,vpp,cr,rv) \
319 rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr)
320#define VOP_RMDIR(dp,d,cr,rv) \
321 rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr)
322#define VOP_READDIR(vp,uiop,cr,eofp,rv) \
323 rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp)
324#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv) \
325 rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr)
326#define VOP_READLINK(vp,uiop,fl,cr,rv) \
327 rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr)
328#define VOP_FSYNC(vp,f,cr,b,e,rv) \
329 rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e)
330#define VOP_INACTIVE(vp, cr, rv) \
331 rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr)
332#define VOP_RELEASE(vp, rv) \
333 rv = _VOP_(vop_release, vp)((vp)->v_fbhv)
334#define VOP_FID2(vp, fidp, rv) \
335 rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp)
336#define VOP_RWLOCK(vp,i) \
337 (void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
338#define VOP_RWLOCK_TRY(vp,i) \
339 _VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
340#define VOP_RWUNLOCK(vp,i) \
341 (void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i)
342#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv) \
343 rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr)
344#define VOP_RECLAIM(vp, rv) \
345 rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv)
346#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv) \
347 rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred)
348#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv) \
349 rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred)
350#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv) \
351 rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred)
352#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv) \
353 rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred)
354#define VOP_LINK_REMOVED(vp, dvp, linkzero) \
355 (void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero)
356#define VOP_VNODE_CHANGE(vp, cmd, val) \
357 (void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val)
358/*
359 * These are page cache functions that now go thru VOPs.
360 * 'last' parameter is unused and left in for IRIX compatibility
361 */
362#define VOP_TOSS_PAGES(vp, first, last, fiopt) \
363 _VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt)
364/*
365 * 'last' parameter is unused and left in for IRIX compatibility
366 */
367#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt) \
368 _VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt)
369/*
370 * 'last' parameter is unused and left in for IRIX compatibility
371 */
372#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv) \
373 rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt)
374#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv) \
375 rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg)
376#define VOP_IFLUSH(vp, flags, rv) \
377 rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags)
378
379/*
380 * Flags for read/write calls - same values as IRIX
381 */
382#define IO_ISAIO 0x00001 /* don't wait for completion */
383#define IO_ISDIRECT 0x00004 /* bypass page cache */
384#define IO_INVIS 0x00020 /* don't update inode timestamps */
385
386/*
387 * Flags for VOP_IFLUSH call
388 */
389#define FLUSH_SYNC 1 /* wait for flush to complete */
390#define FLUSH_INODE 2 /* flush the inode itself */
391#define FLUSH_LOG 4 /* force the last log entry for
392 * this inode out to disk */
393
394/*
395 * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
396 * VOP_FLUSH_PAGES.
397 */
398#define FI_NONE 0 /* none */
399#define FI_REMAPF 1 /* Do a remapf prior to the operation */
400#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation.
401 Prevent VM access to the pages until
402 the operation completes. */
403
404/*
405 * Vnode attributes. va_mask indicates those attributes the caller
406 * wants to set or extract.
407 */
408typedef struct vattr {
409 int va_mask; /* bit-mask of attributes present */
410 enum vtype va_type; /* vnode type (for create) */
411 mode_t va_mode; /* file access mode and type */
412 nlink_t va_nlink; /* number of references to file */
413 uid_t va_uid; /* owner user id */
414 gid_t va_gid; /* owner group id */
415 xfs_ino_t va_nodeid; /* file id */
416 xfs_off_t va_size; /* file size in bytes */
417 u_long va_blocksize; /* blocksize preferred for i/o */
418 struct timespec va_atime; /* time of last access */
419 struct timespec va_mtime; /* time of last modification */
420 struct timespec va_ctime; /* time file changed */
421 u_int va_gen; /* generation number of file */
422 xfs_dev_t va_rdev; /* device the special file represents */
423 __int64_t va_nblocks; /* number of blocks allocated */
424 u_long va_xflags; /* random extended file flags */
425 u_long va_extsize; /* file extent size */
426 u_long va_nextents; /* number of extents in file */
427 u_long va_anextents; /* number of attr extents in file */
428 int va_projid; /* project id */
429} vattr_t;
430
431/*
432 * setattr or getattr attributes
433 */
434#define XFS_AT_TYPE 0x00000001
435#define XFS_AT_MODE 0x00000002
436#define XFS_AT_UID 0x00000004
437#define XFS_AT_GID 0x00000008
438#define XFS_AT_FSID 0x00000010
439#define XFS_AT_NODEID 0x00000020
440#define XFS_AT_NLINK 0x00000040
441#define XFS_AT_SIZE 0x00000080
442#define XFS_AT_ATIME 0x00000100
443#define XFS_AT_MTIME 0x00000200
444#define XFS_AT_CTIME 0x00000400
445#define XFS_AT_RDEV 0x00000800
446#define XFS_AT_BLKSIZE 0x00001000
447#define XFS_AT_NBLOCKS 0x00002000
448#define XFS_AT_VCODE 0x00004000
449#define XFS_AT_MAC 0x00008000
450#define XFS_AT_UPDATIME 0x00010000
451#define XFS_AT_UPDMTIME 0x00020000
452#define XFS_AT_UPDCTIME 0x00040000
453#define XFS_AT_ACL 0x00080000
454#define XFS_AT_CAP 0x00100000
455#define XFS_AT_INF 0x00200000
456#define XFS_AT_XFLAGS 0x00400000
457#define XFS_AT_EXTSIZE 0x00800000
458#define XFS_AT_NEXTENTS 0x01000000
459#define XFS_AT_ANEXTENTS 0x02000000
460#define XFS_AT_PROJID 0x04000000
461#define XFS_AT_SIZE_NOPERM 0x08000000
462#define XFS_AT_GENCOUNT 0x10000000
463
464#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
465 XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
466 XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
467 XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\
468 XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\
469 XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT)
470
471#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
472 XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
473 XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
474 XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID)
475
476#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME)
477
478#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME)
479
480#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\
481 XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\
482 XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT)
483
484/*
485 * Modes.
486 */
487#define VSUID S_ISUID /* set user id on execution */
488#define VSGID S_ISGID /* set group id on execution */
489#define VSVTX S_ISVTX /* save swapped text even after use */
490#define VREAD S_IRUSR /* read, write, execute permissions */
491#define VWRITE S_IWUSR
492#define VEXEC S_IXUSR
493
494#define MODEMASK S_IALLUGO /* mode bits plus permission bits */
495
496/*
497 * Check whether mandatory file locking is enabled.
498 */
499#define MANDLOCK(vp, mode) \
500 ((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
501
502extern void vn_init(void);
503extern int vn_wait(struct vnode *);
504extern vnode_t *vn_initialize(struct inode *);
505
506/*
507 * Acquiring and invalidating vnodes:
508 *
509 * if (vn_get(vp, version, 0))
510 * ...;
511 * vn_purge(vp, version);
512 *
513 * vn_get and vn_purge must be called with vmap_t arguments, sampled
514 * while a lock that the vnode's VOP_RECLAIM function acquires is
515 * held, to ensure that the vnode sampled with the lock held isn't
516 * recycled (VOP_RECLAIMed) or deallocated between the release of the lock
517 * and the subsequent vn_get or vn_purge.
518 */
519
520/*
521 * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
522 */
523typedef struct vnode_map {
524 vfs_t *v_vfsp;
525 vnumber_t v_number; /* in-core vnode number */
526 xfs_ino_t v_ino; /* inode # */
527} vmap_t;
528
529#define VMAP(vp, vmap) {(vmap).v_vfsp = (vp)->v_vfsp, \
530 (vmap).v_number = (vp)->v_number, \
531 (vmap).v_ino = (vp)->v_inode.i_ino; }
532
533extern void vn_purge(struct vnode *, vmap_t *);
534extern vnode_t *vn_get(struct vnode *, vmap_t *);
535extern int vn_revalidate(struct vnode *);
536extern void vn_revalidate_core(struct vnode *, vattr_t *);
537extern void vn_remove(struct vnode *);
538
539static inline int vn_count(struct vnode *vp)
540{
541 return atomic_read(&LINVFS_GET_IP(vp)->i_count);
542}
543
544/*
545 * Vnode reference counting functions (and macros for compatibility).
546 */
547extern vnode_t *vn_hold(struct vnode *);
548extern void vn_rele(struct vnode *);
549
550#if defined(XFS_VNODE_TRACE)
551#define VN_HOLD(vp) \
552 ((void)vn_hold(vp), \
553 vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address))
554#define VN_RELE(vp) \
555 (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \
556 iput(LINVFS_GET_IP(vp)))
557#else
558#define VN_HOLD(vp) ((void)vn_hold(vp))
559#define VN_RELE(vp) (iput(LINVFS_GET_IP(vp)))
560#endif
561
562/*
563 * Vname handling macros.
564 */
565#define VNAME(dentry) ((char *) (dentry)->d_name.name)
566#define VNAMELEN(dentry) ((dentry)->d_name.len)
567#define VNAME_TO_VNODE(dentry) (LINVFS_GET_VP((dentry)->d_inode))
568
569/*
570 * Vnode spinlock manipulation.
571 */
572#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock)
573#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s)
574#define VN_FLAGSET(vp,b) vn_flagset(vp,b)
575#define VN_FLAGCLR(vp,b) vn_flagclr(vp,b)
576
577static __inline__ void vn_flagset(struct vnode *vp, uint flag)
578{
579 spin_lock(&vp->v_lock);
580 vp->v_flag |= flag;
581 spin_unlock(&vp->v_lock);
582}
583
584static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
585{
586 spin_lock(&vp->v_lock);
587 vp->v_flag &= ~flag;
588 spin_unlock(&vp->v_lock);
589}
590
591/*
592 * Update modify/access/change times on the vnode
593 */
594#define VN_MTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_mtime = *(tvp))
595#define VN_ATIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_atime = *(tvp))
596#define VN_CTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_ctime = *(tvp))
597
598/*
599 * Dealing with bad inodes
600 */
601static inline void vn_mark_bad(struct vnode *vp)
602{
603 make_bad_inode(LINVFS_GET_IP(vp));
604}
605
606static inline int VN_BAD(struct vnode *vp)
607{
608 return is_bad_inode(LINVFS_GET_IP(vp));
609}
610
611/*
612 * Some useful predicates.
613 */
614#define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping)
615#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages)
616#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
617 PAGECACHE_TAG_DIRTY)
618#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
619#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED)
620
621/*
622 * Flags to VOP_SETATTR/VOP_GETATTR.
623 */
624#define ATTR_UTIME 0x01 /* non-default utime(2) request */
625#define ATTR_DMI 0x08 /* invocation from a DMI function */
626#define ATTR_LAZY 0x80 /* set/get attributes lazily */
627#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */
628
629/*
630 * Flags to VOP_FSYNC and VOP_RECLAIM.
631 */
632#define FSYNC_NOWAIT 0 /* asynchronous flush */
633#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */
634#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */
635#define FSYNC_DATA 0x4 /* synchronous fsync of data only */
636
637/*
638 * Tracking vnode activity.
639 */
640#if defined(XFS_VNODE_TRACE)
641
642#define VNODE_TRACE_SIZE 16 /* number of trace entries */
643#define VNODE_KTRACE_ENTRY 1
644#define VNODE_KTRACE_EXIT 2
645#define VNODE_KTRACE_HOLD 3
646#define VNODE_KTRACE_REF 4
647#define VNODE_KTRACE_RELE 5
648
649extern void vn_trace_entry(struct vnode *, char *, inst_t *);
650extern void vn_trace_exit(struct vnode *, char *, inst_t *);
651extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
652extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
653extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
654
655#define VN_TRACE(vp) \
656 vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
657#else
658#define vn_trace_entry(a,b,c)
659#define vn_trace_exit(a,b,c)
660#define vn_trace_hold(a,b,c,d)
661#define vn_trace_ref(a,b,c,d)
662#define vn_trace_rele(a,b,c,d)
663#define VN_TRACE(vp)
664#endif
665
666#endif /* __XFS_VNODE_H__ */
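
IFTOVT() and VTTOIF() above rely on S_IFMT occupying bits 12-15 of the mode word, so (mode & S_IFMT) >> 12 is a dense index from 0 to 15 into iftovt_tab[], and vttoif_tab[] inverts the mapping, indexed by vtype. A self-contained round trip through the same two tables (userspace re-declarations for illustration; the table contents mirror xfs_vnode.c):

	#include <stdio.h>
	#include <sys/stat.h>

	enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK };

	/* Indexed by (mode & S_IFMT) >> 12, as in iftovt_tab above. */
	static const enum vtype iftovt_tab[16] = {
		VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
		VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
	};

	/* Indexed by vtype, as in vttoif_tab above. */
	static const unsigned short vttoif_tab[] = {
		0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
	};

	#define IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
	#define VTTOIF(indx)	(vttoif_tab[(int)(indx)])

	int main(void)
	{
		mode_t mode = S_IFREG | 0644;
		enum vtype vt = IFTOVT(mode);

		/* Round trip: S_IFREG (0100000) -> VREG -> S_IFREG. */
		printf("vtype %d maps back to mode bits 0%o\n",
		       (int)vt, (unsigned)VTTOIF(vt));
		return 0;
	}

The VNON entries in iftovt_tab correspond to unused S_IFMT encodings; MAKEIMODE() then simply ORs the permission bits back in.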
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
new file mode 100644
index 000000000000..740d20d33187
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -0,0 +1,1648 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_trans_space.h"
68#include "xfs_trans_priv.h"
69
70#include "xfs_qm.h"
71
72
73/*
74 LOCK ORDER
75
76 inode lock (ilock)
77 dquot hash-chain lock (hashlock)
 78 xqm dquot freelist lock (freelistlock)
79 mount's dquot list lock (mplistlock)
80 user dquot lock - lock ordering among dquots is based on the uid or gid
81 group dquot lock - similar to udquots. Between the two dquots, the udquot
82 has to be locked first.
83 pin lock - the dquot lock must be held to take this lock.
84 flush lock - ditto.
85*/
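/*
 * To illustrate (a hedged sketch; the helpers are the usual XFS locking
 * entry points, but this exact sequence is hypothetical): a caller
 * needing both dquots of an inode takes the locks in the order listed
 * above, user dquot strictly before group dquot:
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_dqlock(udqp);
 *	xfs_dqlock(gdqp);
 *	...
 *	xfs_dqunlock(gdqp);
 *	xfs_dqunlock(udqp);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *
 * The pin or flush lock of a dquot may then be taken only while that
 * dquot's lock is held.
 */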
86
87STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
88
89#ifdef DEBUG
90xfs_buftarg_t *xfs_dqerror_target;
91int xfs_do_dqerror;
92int xfs_dqreq_num;
93int xfs_dqerror_mod = 33;
94#endif
95
96/*
97 * Allocate and initialize a dquot. We don't always allocate fresh memory;
98 * we try to reclaim a free dquot if the number of incore dquots is above
99 * a threshold.
100 * The only field inside the core that gets initialized at this point
101 * is the d_id field. The idea is to fill in the entire q_core
102 * when we read in the on disk dquot.
103 */
104xfs_dquot_t *
105xfs_qm_dqinit(
106 xfs_mount_t *mp,
107 xfs_dqid_t id,
108 uint type)
109{
110 xfs_dquot_t *dqp;
111 boolean_t brandnewdquot;
112
113 brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
114 dqp->dq_flags = type;
115 INT_SET(dqp->q_core.d_id, ARCH_CONVERT, id);
116 dqp->q_mount = mp;
117
118 /*
119 * No need to re-initialize these if this is a reclaimed dquot.
120 */
121 if (brandnewdquot) {
122 dqp->dq_flnext = dqp->dq_flprev = dqp;
123 mutex_init(&dqp->q_qlock, MUTEX_DEFAULT, "xdq");
124 initnsema(&dqp->q_flock, 1, "fdq");
125 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
126
127#ifdef XFS_DQUOT_TRACE
128 dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
129 xfs_dqtrace_entry(dqp, "DQINIT");
130#endif
131 } else {
132 /*
133 * Only the q_core portion was zeroed in dqreclaim_one().
134 * So, we need to reset others.
135 */
136 dqp->q_nrefs = 0;
137 dqp->q_blkno = 0;
138 dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
139 dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
140 dqp->q_bufoffset = 0;
141 dqp->q_fileoffset = 0;
142 dqp->q_transp = NULL;
143 dqp->q_gdquot = NULL;
144 dqp->q_res_bcount = 0;
145 dqp->q_res_icount = 0;
146 dqp->q_res_rtbcount = 0;
147 dqp->q_pincount = 0;
148 dqp->q_hash = NULL;
149 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
150
151#ifdef XFS_DQUOT_TRACE
152 ASSERT(dqp->q_trace);
153 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
154#endif
155 }
156
157 /*
158 * log item gets initialized later
159 */
160 return (dqp);
161}
162
163/*
164 * This is called to free all the memory associated with a dquot
165 */
166void
167xfs_qm_dqdestroy(
168 xfs_dquot_t *dqp)
169{
170 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
171
172 mutex_destroy(&dqp->q_qlock);
173 freesema(&dqp->q_flock);
174 sv_destroy(&dqp->q_pinwait);
175
176#ifdef XFS_DQUOT_TRACE
177 if (dqp->q_trace)
178 ktrace_free(dqp->q_trace);
179 dqp->q_trace = NULL;
180#endif
181 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
182 atomic_dec(&xfs_Gqm->qm_totaldquots);
183}
184
185/*
186 * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
187 */
188STATIC void
189xfs_qm_dqinit_core(
190 xfs_dqid_t id,
191 uint type,
192 xfs_dqblk_t *d)
193{
194 /*
195 * Caller has zero'd the entire dquot 'chunk' already.
196 */
197 INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC);
198 INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION);
199 INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id);
200 INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type);
201}
202
203
204#ifdef XFS_DQUOT_TRACE
205/*
206 * Dquot tracing for debugging.
207 */
208/* ARGSUSED */
209void
210__xfs_dqtrace_entry(
211 xfs_dquot_t *dqp,
212 char *func,
213 void *retaddr,
214 xfs_inode_t *ip)
215{
216 xfs_dquot_t *udqp = NULL;
217 xfs_ino_t ino = 0;
218
219 ASSERT(dqp->q_trace);
220 if (ip) {
221 ino = ip->i_ino;
222 udqp = ip->i_udquot;
223 }
224 ktrace_enter(dqp->q_trace,
225 (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
226 (void *)func,
227 (void *)(__psint_t)dqp->q_nrefs,
228 (void *)(__psint_t)dqp->dq_flags,
229 (void *)(__psint_t)dqp->q_res_bcount,
230 (void *)(__psint_t)INT_GET(dqp->q_core.d_bcount,
231 ARCH_CONVERT),
232 (void *)(__psint_t)INT_GET(dqp->q_core.d_icount,
233 ARCH_CONVERT),
234 (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_hardlimit,
235 ARCH_CONVERT),
236 (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_softlimit,
237 ARCH_CONVERT),
238 (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_hardlimit,
239 ARCH_CONVERT),
240 (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_softlimit,
241 ARCH_CONVERT),
242 (void *)(__psint_t)INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
243 (void *)(__psint_t)current_pid(),
244 (void *)(__psint_t)ino,
245 (void *)(__psint_t)retaddr,
246 (void *)(__psint_t)udqp);
247 return;
248}
249#endif
250
251
252/*
253 * If default limits are in force, push them into the dquot now.
254 * We overwrite the dquot limits only if they are zero and this
255 * is not the root dquot.
256 */
257void
258xfs_qm_adjust_dqlimits(
259 xfs_mount_t *mp,
260 xfs_disk_dquot_t *d)
261{
262 xfs_quotainfo_t *q = mp->m_quotainfo;
263
264 ASSERT(d->d_id);
265
266 if (q->qi_bsoftlimit && !d->d_blk_softlimit)
267 INT_SET(d->d_blk_softlimit, ARCH_CONVERT, q->qi_bsoftlimit);
268 if (q->qi_bhardlimit && !d->d_blk_hardlimit)
269 INT_SET(d->d_blk_hardlimit, ARCH_CONVERT, q->qi_bhardlimit);
270 if (q->qi_isoftlimit && !d->d_ino_softlimit)
271 INT_SET(d->d_ino_softlimit, ARCH_CONVERT, q->qi_isoftlimit);
272 if (q->qi_ihardlimit && !d->d_ino_hardlimit)
273 INT_SET(d->d_ino_hardlimit, ARCH_CONVERT, q->qi_ihardlimit);
274 if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
275 INT_SET(d->d_rtb_softlimit, ARCH_CONVERT, q->qi_rtbsoftlimit);
276 if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
277 INT_SET(d->d_rtb_hardlimit, ARCH_CONVERT, q->qi_rtbhardlimit);
278}
279
280/*
281 * Check the limits and timers of a dquot and start or reset timers
282 * if necessary.
283 * This gets called even when quota enforcement is OFF, which makes our
284 * life a little less complicated. (We just don't reject any quota
285 * reservations in that case.)
286 * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
287 * enforcement's off.
288 * In contrast, warnings are a little different in that they don't
289 * 'automatically' get started when limits get exceeded.
290 */
291void
292xfs_qm_adjust_dqtimers(
293 xfs_mount_t *mp,
294 xfs_disk_dquot_t *d)
295{
296 ASSERT(d->d_id);
297
298#ifdef QUOTADEBUG
299 if (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT))
300 ASSERT(INT_GET(d->d_blk_softlimit, ARCH_CONVERT) <=
301 INT_GET(d->d_blk_hardlimit, ARCH_CONVERT));
302 if (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT))
303 ASSERT(INT_GET(d->d_ino_softlimit, ARCH_CONVERT) <=
304 INT_GET(d->d_ino_hardlimit, ARCH_CONVERT));
305 if (INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT))
306 ASSERT(INT_GET(d->d_rtb_softlimit, ARCH_CONVERT) <=
307 INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT));
308#endif
309 if (!d->d_btimer) {
310 if ((INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
311 (INT_GET(d->d_bcount, ARCH_CONVERT) >=
312 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) ||
313 (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT) &&
314 (INT_GET(d->d_bcount, ARCH_CONVERT) >=
315 INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
316 INT_SET(d->d_btimer, ARCH_CONVERT,
317 get_seconds() + XFS_QI_BTIMELIMIT(mp));
318 }
319 } else {
320 if ((!d->d_blk_softlimit ||
321 (INT_GET(d->d_bcount, ARCH_CONVERT) <
322 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) &&
323 (!d->d_blk_hardlimit ||
324 (INT_GET(d->d_bcount, ARCH_CONVERT) <
325 INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
326 d->d_btimer = 0;
327 }
328 }
329
330 if (!d->d_itimer) {
331 if ((INT_GET(d->d_ino_softlimit, ARCH_CONVERT) &&
332 (INT_GET(d->d_icount, ARCH_CONVERT) >=
333 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) ||
334 (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT) &&
335 (INT_GET(d->d_icount, ARCH_CONVERT) >=
336 INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
337 INT_SET(d->d_itimer, ARCH_CONVERT,
338 get_seconds() + XFS_QI_ITIMELIMIT(mp));
339 }
340 } else {
341 if ((!d->d_ino_softlimit ||
342 (INT_GET(d->d_icount, ARCH_CONVERT) <
343 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) &&
344 (!d->d_ino_hardlimit ||
345 (INT_GET(d->d_icount, ARCH_CONVERT) <
346 INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
347 d->d_itimer = 0;
348 }
349 }
350
351 if (!d->d_rtbtimer) {
352 if ((INT_GET(d->d_rtb_softlimit, ARCH_CONVERT) &&
353 (INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
354 INT_GET(d->d_rtb_softlimit, ARCH_CONVERT))) ||
355 (INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT) &&
356 (INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
357 INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)))) {
358 INT_SET(d->d_rtbtimer, ARCH_CONVERT,
359 get_seconds() + XFS_QI_RTBTIMELIMIT(mp));
360 }
361 } else {
362 if ((!d->d_rtb_softlimit ||
363 (INT_GET(d->d_rtbcount, ARCH_CONVERT) <
364 INT_GET(d->d_rtb_softlimit, ARCH_CONVERT))) &&
365 (!d->d_rtb_hardlimit ||
366 (INT_GET(d->d_rtbcount, ARCH_CONVERT) <
367 INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)))) {
368 d->d_rtbtimer = 0;
369 }
370 }
371}
372
373/*
374 * Increment or reset warnings of a given dquot.
375 */
376int
377xfs_qm_dqwarn(
378 xfs_disk_dquot_t *d,
379 uint flags)
380{
381 int warned;
382
383 /*
384 * root's limits are not real limits.
385 */
386 if (!d->d_id)
387 return (0);
388
389 warned = 0;
390 if (INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
391 (INT_GET(d->d_bcount, ARCH_CONVERT) >=
392 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
393 if (flags & XFS_QMOPT_DOWARN) {
394 INT_MOD(d->d_bwarns, ARCH_CONVERT, +1);
395 warned++;
396 }
397 } else {
398 if (!d->d_blk_softlimit ||
399 (INT_GET(d->d_bcount, ARCH_CONVERT) <
400 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
401 d->d_bwarns = 0;
402 }
403 }
404
405 if (INT_GET(d->d_ino_softlimit, ARCH_CONVERT) > 0 &&
406 (INT_GET(d->d_icount, ARCH_CONVERT) >=
407 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
408 if (flags & XFS_QMOPT_DOWARN) {
409 INT_MOD(d->d_iwarns, ARCH_CONVERT, +1);
410 warned++;
411 }
412 } else {
413 if (!d->d_ino_softlimit ||
414 (INT_GET(d->d_icount, ARCH_CONVERT) <
415 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
416 d->d_iwarns = 0;
417 }
418 }
419#ifdef QUOTADEBUG
420 if (INT_GET(d->d_iwarns, ARCH_CONVERT))
421 cmn_err(CE_DEBUG,
422 "--------@@Inode warnings running : %Lu >= %Lu",
423 INT_GET(d->d_icount, ARCH_CONVERT),
424 INT_GET(d->d_ino_softlimit, ARCH_CONVERT));
425 if (INT_GET(d->d_bwarns, ARCH_CONVERT))
426 cmn_err(CE_DEBUG,
427 "--------@@Blks warnings running : %Lu >= %Lu",
428 INT_GET(d->d_bcount, ARCH_CONVERT),
429 INT_GET(d->d_blk_softlimit, ARCH_CONVERT));
430#endif
431 return (warned);
432}
433
434
435/*
436 * initialize a buffer full of dquots and log the whole thing
437 */
438STATIC void
439xfs_qm_init_dquot_blk(
440 xfs_trans_t *tp,
441 xfs_mount_t *mp,
442 xfs_dqid_t id,
443 uint type,
444 xfs_buf_t *bp)
445{
446 xfs_dqblk_t *d;
447 int curid, i;
448
449 ASSERT(tp);
450 ASSERT(XFS_BUF_ISBUSY(bp));
451 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
452
453 d = (xfs_dqblk_t *)XFS_BUF_PTR(bp);
454
455 /*
456	 * ID of the first dquot in the block - ids are zero based.
457 */
458 curid = id - (id % XFS_QM_DQPERBLK(mp));
459 ASSERT(curid >= 0);
460 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
461 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
462 xfs_qm_dqinit_core(curid, type, d);
463 xfs_trans_dquot_buf(tp, bp,
464 type & XFS_DQ_USER ?
465 XFS_BLI_UDQUOT_BUF :
466 XFS_BLI_GDQUOT_BUF);
467 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
468}
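
To make the chunk arithmetic above concrete (illustrative numbers only): assuming XFS_QM_DQPERBLK(mp) == 30, a request for id 70 yields curid = 70 - (70 % 30) = 60, so the loop stamps on-disk dquots with ids 60 through 89 into the buffer; the dquot for id 70 later sits at buffer offset (70 % 30) * sizeof(xfs_dqblk_t), which is how xfs_qm_dqtobp() computes q_bufoffset below.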
469
470
471
472/*
473 * Allocate a block and fill it with dquots.
474 * This is called when the bmapi finds a hole.
475 */
476STATIC int
477xfs_qm_dqalloc(
478 xfs_trans_t *tp,
479 xfs_mount_t *mp,
480 xfs_dquot_t *dqp,
481 xfs_inode_t *quotip,
482 xfs_fileoff_t offset_fsb,
483 xfs_buf_t **O_bpp)
484{
485 xfs_fsblock_t firstblock;
486 xfs_bmap_free_t flist;
487 xfs_bmbt_irec_t map;
488 int nmaps, error, committed;
489 xfs_buf_t *bp;
490
491 ASSERT(tp != NULL);
492 xfs_dqtrace_entry(dqp, "DQALLOC");
493
494 /*
495 * Initialize the bmap freelist prior to calling bmapi code.
496 */
497 XFS_BMAP_INIT(&flist, &firstblock);
498 xfs_ilock(quotip, XFS_ILOCK_EXCL);
499 /*
500	 * Return if this type of quota is turned off while we didn't
501	 * have the inode lock
502 */
503 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
504 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
505 return (ESRCH);
506 }
507
508 /*
509 * xfs_trans_commit normally decrements the vnode ref count
510 * when it unlocks the inode. Since we want to keep the quota
511 * inode around, we bump the vnode ref count now.
512 */
513 VN_HOLD(XFS_ITOV(quotip));
514
515 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
516 nmaps = 1;
517 if ((error = xfs_bmapi(tp, quotip,
518 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
519 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
520 &firstblock,
521 XFS_QM_DQALLOC_SPACE_RES(mp),
522 &map, &nmaps, &flist))) {
523 goto error0;
524 }
525 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
526 ASSERT(nmaps == 1);
527 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
528 (map.br_startblock != HOLESTARTBLOCK));
529
530 /*
531 * Keep track of the blkno to save a lookup later
532 */
533 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
534
535 /* now we can just get the buffer (there's nothing to read yet) */
536 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
537 dqp->q_blkno,
538 XFS_QI_DQCHUNKLEN(mp),
539 0);
540 if (!bp || (error = XFS_BUF_GETERROR(bp)))
541 goto error1;
542 /*
543 * Make a chunk of dquots out of this buffer and log
544 * the entire thing.
545 */
546 xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
547 dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
548 bp);
549
550 if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) {
551 goto error1;
552 }
553
554 *O_bpp = bp;
555 return 0;
556
557 error1:
558 xfs_bmap_cancel(&flist);
559 error0:
560 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
561
562 return (error);
563}
564
565/*
566 * Maps a dquot to the buffer containing its on-disk version.
567 * This returns a ptr to the buffer containing the on-disk dquot
568 * in the bpp param, and a ptr to the on-disk dquot within that buffer
569 */
570STATIC int
571xfs_qm_dqtobp(
572 xfs_trans_t *tp,
573 xfs_dquot_t *dqp,
574 xfs_disk_dquot_t **O_ddpp,
575 xfs_buf_t **O_bpp,
576 uint flags)
577{
578 xfs_bmbt_irec_t map;
579 int nmaps, error;
580 xfs_buf_t *bp;
581 xfs_inode_t *quotip;
582 xfs_mount_t *mp;
583 xfs_disk_dquot_t *ddq;
584 xfs_dqid_t id;
585 boolean_t newdquot;
586
587 mp = dqp->q_mount;
588 id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
589 nmaps = 1;
590 newdquot = B_FALSE;
591
592 /*
593 * If we don't know where the dquot lives, find out.
594 */
595 if (dqp->q_blkno == (xfs_daddr_t) 0) {
596 /* We use the id as an index */
597 dqp->q_fileoffset = (xfs_fileoff_t) ((uint)id /
598 XFS_QM_DQPERBLK(mp));
599 nmaps = 1;
600 quotip = XFS_DQ_TO_QIP(dqp);
601 xfs_ilock(quotip, XFS_ILOCK_SHARED);
602 /*
603		 * Return if this type of quota is turned off while we didn't
604		 * have the inode lock
605 */
606 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
607 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
608 return (ESRCH);
609 }
610 /*
611 * Find the block map; no allocations yet
612 */
613 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
614 XFS_DQUOT_CLUSTER_SIZE_FSB,
615 XFS_BMAPI_METADATA,
616 NULL, 0, &map, &nmaps, NULL);
617
618 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
619 if (error)
620 return (error);
621 ASSERT(nmaps == 1);
622 ASSERT(map.br_blockcount == 1);
623
624 /*
625		 * offset of dquot in the (fixed-size) dquot chunk.
626 */
627 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
628 sizeof(xfs_dqblk_t);
629 if (map.br_startblock == HOLESTARTBLOCK) {
630 /*
631 * We don't allocate unless we're asked to
632 */
633 if (!(flags & XFS_QMOPT_DQALLOC))
634 return (ENOENT);
635
636 ASSERT(tp);
637 if ((error = xfs_qm_dqalloc(tp, mp, dqp, quotip,
638 dqp->q_fileoffset, &bp)))
639 return (error);
640 newdquot = B_TRUE;
641 } else {
642 /*
643 * store the blkno etc so that we don't have to do the
644 * mapping all the time
645 */
646 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
647 }
648 }
649 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
650 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
651
652 /*
653 * Read in the buffer, unless we've just done the allocation
654 * (in which case we already have the buf).
655 */
656 if (! newdquot) {
657 xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
658 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
659 dqp->q_blkno,
660 XFS_QI_DQCHUNKLEN(mp),
661 0, &bp))) {
662 return (error);
663 }
664 if (error || !bp)
665 return XFS_ERROR(error);
666 }
667 ASSERT(XFS_BUF_ISBUSY(bp));
668 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
669
670 /*
671 * calculate the location of the dquot inside the buffer.
672 */
673 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
674
675 /*
676 * A simple sanity check in case we got a corrupted dquot...
677 */
678 if (xfs_qm_dqcheck(ddq, id,
679 dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
680 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
681 "dqtobp")) {
682 if (!(flags & XFS_QMOPT_DQREPAIR)) {
683 xfs_trans_brelse(tp, bp);
684 return XFS_ERROR(EIO);
685 }
686 XFS_BUF_BUSY(bp); /* We dirtied this */
687 }
688
689 *O_bpp = bp;
690 *O_ddpp = ddq;
691
692 return (0);
693}
694
695
696/*
697 * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
698 * and release the buffer immediately.
699 *
700 */
701/* ARGSUSED */
702STATIC int
703xfs_qm_dqread(
704 xfs_trans_t *tp,
705 xfs_dqid_t id,
706 xfs_dquot_t *dqp, /* dquot to get filled in */
707 uint flags)
708{
709 xfs_disk_dquot_t *ddqp;
710 xfs_buf_t *bp;
711 int error;
712
713 /*
714 * get a pointer to the on-disk dquot and the buffer containing it
715 * dqp already knows its own type (GROUP/USER).
716 */
717 xfs_dqtrace_entry(dqp, "DQREAD");
718 if ((error = xfs_qm_dqtobp(tp, dqp, &ddqp, &bp, flags))) {
719 return (error);
720 }
721
722 /* copy everything from disk dquot to the incore dquot */
723 memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
724 ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
725 xfs_qm_dquot_logitem_init(dqp);
726
727 /*
728 * Reservation counters are defined as reservation plus current usage
729	 * to avoid having to add them every time.
730 */
731 dqp->q_res_bcount = INT_GET(ddqp->d_bcount, ARCH_CONVERT);
732 dqp->q_res_icount = INT_GET(ddqp->d_icount, ARCH_CONVERT);
733 dqp->q_res_rtbcount = INT_GET(ddqp->d_rtbcount, ARCH_CONVERT);
734
735 /* Mark the buf so that this will stay incore a little longer */
736 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
737
738 /*
739	 * We got the buffer with an xfs_trans_read_buf() (in dqtobp()),
740	 * so we need to release it with xfs_trans_brelse().
741 * The strategy here is identical to that of inodes; we lock
742 * the dquot in xfs_qm_dqget() before making it accessible to
743 * others. This is because dquots, like inodes, need a good level of
744 * concurrency, and we don't want to take locks on the entire buffers
745 * for dquot accesses.
746 * Note also that the dquot buffer may even be dirty at this point, if
747 * this particular dquot was repaired. We still aren't afraid to
748 * brelse it because we have the changes incore.
749 */
750 ASSERT(XFS_BUF_ISBUSY(bp));
751 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
752 xfs_trans_brelse(tp, bp);
753
754 return (error);
755}
756
757
758/*
759 * allocate an incore dquot from the kernel heap,
760 * and fill its core with quota information kept on disk.
761 * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
762 * if it wasn't already allocated.
763 */
764STATIC int
765xfs_qm_idtodq(
766 xfs_mount_t *mp,
767 xfs_dqid_t id, /* gid or uid, depending on type */
768 uint type, /* UDQUOT or GDQUOT */
769 uint flags, /* DQALLOC, DQREPAIR */
770 xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */
771{
772 xfs_dquot_t *dqp;
773 int error;
774 xfs_trans_t *tp;
775 int cancelflags=0;
776
777 dqp = xfs_qm_dqinit(mp, id, type);
778 tp = NULL;
779 if (flags & XFS_QMOPT_DQALLOC) {
780 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
781 if ((error = xfs_trans_reserve(tp,
782 XFS_QM_DQALLOC_SPACE_RES(mp),
783 XFS_WRITE_LOG_RES(mp) +
784 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
785 128,
786 0,
787 XFS_TRANS_PERM_LOG_RES,
788 XFS_WRITE_LOG_COUNT))) {
789 cancelflags = 0;
790 goto error0;
791 }
792 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
793 }
794
795 /*
796 * Read it from disk; xfs_dqread() takes care of
797 * all the necessary initialization of dquot's fields (locks, etc)
798 */
799 if ((error = xfs_qm_dqread(tp, id, dqp, flags))) {
800 /*
801 * This can happen if quotas got turned off (ESRCH),
802 * or if the dquot didn't exist on disk and we ask to
803 * allocate (ENOENT).
804 */
805 xfs_dqtrace_entry(dqp, "DQREAD FAIL");
806 cancelflags |= XFS_TRANS_ABORT;
807 goto error0;
808 }
809 if (tp) {
810 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
811 NULL)))
812 goto error1;
813 }
814
815 *O_dqpp = dqp;
816 return (0);
817
818 error0:
819 ASSERT(error);
820 if (tp)
821 xfs_trans_cancel(tp, cancelflags);
822 error1:
823 xfs_qm_dqdestroy(dqp);
824 *O_dqpp = NULL;
825 return (error);
826}
827
828/*
829 * Lookup a dquot in the incore dquot hashtable. We keep two separate
830 * hashtables for user and group dquots; these are global tables
831 * inside the XQM, not per-filesystem tables.
832 * The hash chain must be locked by caller, and it is left locked
833 * on return. The returned dquot is locked.
834 */
835STATIC int
836xfs_qm_dqlookup(
837 xfs_mount_t *mp,
838 xfs_dqid_t id,
839 xfs_dqhash_t *qh,
840 xfs_dquot_t **O_dqpp)
841{
842 xfs_dquot_t *dqp;
843 uint flist_locked;
844 xfs_dquot_t *d;
845
846 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
847
848 flist_locked = B_FALSE;
849
850 /*
851 * Traverse the hashchain looking for a match
852 */
853 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
854 /*
855 * We already have the hashlock. We don't need the
856 * dqlock to look at the id field of the dquot, since the
857 * id can't be modified without the hashlock anyway.
858 */
859 if (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id && dqp->q_mount == mp) {
860 xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
861 /*
862 * All in core dquots must be on the dqlist of mp
863 */
864 ASSERT(dqp->MPL_PREVP != NULL);
865
866 xfs_dqlock(dqp);
867 if (dqp->q_nrefs == 0) {
868 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
869 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
870 xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
871
872 /*
873 * We may have raced with dqreclaim_one()
874 * (and lost). So, flag that we don't
875 * want the dquot to be reclaimed.
876 */
877 dqp->dq_flags |= XFS_DQ_WANT;
878 xfs_dqunlock(dqp);
879 xfs_qm_freelist_lock(xfs_Gqm);
880 xfs_dqlock(dqp);
881 dqp->dq_flags &= ~(XFS_DQ_WANT);
882 }
883 flist_locked = B_TRUE;
884 }
885
886 /*
887 * id couldn't have changed; we had the hashlock all
888 * along
889 */
890 ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
891
892 if (flist_locked) {
893 if (dqp->q_nrefs != 0) {
894 xfs_qm_freelist_unlock(xfs_Gqm);
895 flist_locked = B_FALSE;
896 } else {
897 /*
898 * take it off the freelist
899 */
900 xfs_dqtrace_entry(dqp,
901 "DQLOOKUP: TAKEOFF FL");
902 XQM_FREELIST_REMOVE(dqp);
903 /* xfs_qm_freelist_print(&(xfs_Gqm->
904 qm_dqfreelist),
905 "after removal"); */
906 }
907 }
908
909 /*
910 * grab a reference
911 */
912 XFS_DQHOLD(dqp);
913
914 if (flist_locked)
915 xfs_qm_freelist_unlock(xfs_Gqm);
916 /*
917 * move the dquot to the front of the hashchain
918 */
919 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
920 if (dqp->HL_PREVP != &qh->qh_next) {
921 xfs_dqtrace_entry(dqp,
922 "DQLOOKUP: HASH MOVETOFRONT");
923 if ((d = dqp->HL_NEXT))
924 d->HL_PREVP = dqp->HL_PREVP;
925 *(dqp->HL_PREVP) = d;
926 d = qh->qh_next;
927 d->HL_PREVP = &dqp->HL_NEXT;
928 dqp->HL_NEXT = d;
929 dqp->HL_PREVP = &qh->qh_next;
930 qh->qh_next = dqp;
931 }
932 xfs_dqtrace_entry(dqp, "LOOKUP END");
933 *O_dqpp = dqp;
934 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
935 return (0);
936 }
937 }
938
939 *O_dqpp = NULL;
940 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
941 return (1);
942}
943
944/*
945 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
946 * locked dquot, doing an allocation (if requested) as needed.
947 * When both an inode and an id are given, the inode's id takes precedence.
948 * That is, if the id changes while we don't hold the ilock inside this
949 * function, the new dquot is returned, not necessarily the one requested
950 * in the id argument.
951 */
952int
953xfs_qm_dqget(
954 xfs_mount_t *mp,
955 xfs_inode_t *ip, /* locked inode (optional) */
956 xfs_dqid_t id, /* gid or uid, depending on type */
957 uint type, /* UDQUOT or GDQUOT */
958 uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
959 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
960{
961 xfs_dquot_t *dqp;
962 xfs_dqhash_t *h;
963 uint version;
964 int error;
965
966 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
967 if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
968 (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
969 return (ESRCH);
970 }
971 h = XFS_DQ_HASH(mp, id, type);
972
973#ifdef DEBUG
974 if (xfs_do_dqerror) {
975 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
976 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
977 cmn_err(CE_DEBUG, "Returning error in dqget");
978 return (EIO);
979 }
980 }
981#endif
982
983 again:
984
985#ifdef DEBUG
986 ASSERT(type == XFS_DQ_USER || type == XFS_DQ_GROUP);
987 if (ip) {
988 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
989 if (type == XFS_DQ_USER)
990 ASSERT(ip->i_udquot == NULL);
991 else
992 ASSERT(ip->i_gdquot == NULL);
993 }
994#endif
995 XFS_DQ_HASH_LOCK(h);
996
997 /*
998 * Look in the cache (hashtable).
999 * The chain is kept locked during lookup.
1000 */
1001 if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
1002 XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
1003 /*
1004 * The dquot was found, moved to the front of the chain,
1005 * taken off the freelist if it was on it, and locked
1006 * at this point. Just unlock the hashchain and return.
1007 */
1008 ASSERT(*O_dqpp);
1009 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
1010 XFS_DQ_HASH_UNLOCK(h);
1011 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
1012 return (0); /* success */
1013 }
1014 XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
1015
1016 /*
1017 * Dquot cache miss. We don't want to keep the inode lock across
1018 * a (potential) disk read. Also we don't want to deal with the lock
1019 * ordering between quotainode and this inode. OTOH, dropping the inode
1020 * lock here means dealing with a chown that can happen before
1021 * we re-acquire the lock.
1022 */
1023 if (ip)
1024 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1025 /*
1026 * Save the hashchain version stamp, and unlock the chain, so that
1027 * we don't keep the lock across a disk read
1028 */
1029 version = h->qh_version;
1030 XFS_DQ_HASH_UNLOCK(h);
1031
1032 /*
1033 * Allocate the dquot on the kernel heap, and read the ondisk
1034 * portion off the disk. Also, do all the necessary initialization
1035 * This can return ENOENT if dquot didn't exist on disk and we didn't
1036 * ask it to allocate; ESRCH if quotas got turned off suddenly.
1037 */
1038 if ((error = xfs_qm_idtodq(mp, id, type,
1039 flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
1040 XFS_QMOPT_DOWARN),
1041 &dqp))) {
1042 if (ip)
1043 xfs_ilock(ip, XFS_ILOCK_EXCL);
1044 return (error);
1045 }
1046
1047 /*
1048 * See if this is mount code calling to look at the overall quota limits
1049 * which are stored in the id == 0 user or group's dquot.
1050 * Since we may not have done a quotacheck by this point, just return
1051 * the dquot without attaching it to any hashtables, lists, etc, or even
1052 * taking a reference.
1053 * The caller must dqdestroy this once done.
1054 */
1055 if (flags & XFS_QMOPT_DQSUSER) {
1056 ASSERT(id == 0);
1057 ASSERT(! ip);
1058 goto dqret;
1059 }
1060
1061 /*
1062 * Dquot lock comes after hashlock in the lock ordering
1063 */
1064 if (ip) {
1065 xfs_ilock(ip, XFS_ILOCK_EXCL);
1066 if (! XFS_IS_DQTYPE_ON(mp, type)) {
1067 /* inode stays locked on return */
1068 xfs_qm_dqdestroy(dqp);
1069 return XFS_ERROR(ESRCH);
1070 }
1071 /*
1072 * A dquot could be attached to this inode by now, since
1073 * we had dropped the ilock.
1074 */
1075 if (type == XFS_DQ_USER) {
1076 if (ip->i_udquot) {
1077 xfs_qm_dqdestroy(dqp);
1078 dqp = ip->i_udquot;
1079 xfs_dqlock(dqp);
1080 goto dqret;
1081 }
1082 } else {
1083 if (ip->i_gdquot) {
1084 xfs_qm_dqdestroy(dqp);
1085 dqp = ip->i_gdquot;
1086 xfs_dqlock(dqp);
1087 goto dqret;
1088 }
1089 }
1090 }
1091
1092 /*
1093 * Hashlock comes after ilock in lock order
1094 */
1095 XFS_DQ_HASH_LOCK(h);
1096 if (version != h->qh_version) {
1097 xfs_dquot_t *tmpdqp;
1098 /*
1099 * Now, see if somebody else put the dquot in the
1100 * hashtable before us. This can happen because we didn't
1101 * keep the hashchain lock. We don't have to worry about
1102 * lock order between the two dquots here since dqp isn't
1103 * on any findable lists yet.
1104 */
1105 if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
1106 /*
1107 * Duplicate found. Just throw away the new dquot
1108 * and start over.
1109 */
1110 xfs_qm_dqput(tmpdqp);
1111 XFS_DQ_HASH_UNLOCK(h);
1112 xfs_qm_dqdestroy(dqp);
1113 XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
1114 goto again;
1115 }
1116 }
1117
1118 /*
1119 * Put the dquot at the beginning of the hash-chain and mp's list
1120 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
1121 */
1122 ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
1123 dqp->q_hash = h;
1124 XQM_HASHLIST_INSERT(h, dqp);
1125
1126 /*
1127 * Attach this dquot to this filesystem's list of all dquots,
1128 * kept inside the mount structure in m_quotainfo field
1129 */
1130 xfs_qm_mplist_lock(mp);
1131
1132 /*
1133 * We return a locked dquot to the caller, with a reference taken
1134 */
1135 xfs_dqlock(dqp);
1136 dqp->q_nrefs = 1;
1137
1138 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
1139
1140 xfs_qm_mplist_unlock(mp);
1141 XFS_DQ_HASH_UNLOCK(h);
1142 dqret:
1143 ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip));
1144 xfs_dqtrace_entry(dqp, "DQGET DONE");
1145 *O_dqpp = dqp;
1146 return (0);
1147}
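
The hashchain version stamp used above is a general optimistic-revalidation pattern. A hedged distillation follows; example_versioned_insert() is hypothetical, while the locking and insert macros are the real ones from this commit.

/*
 * Sketch only: stamp the chain, do slow work unlocked, and revalidate
 * the stamp before trusting the earlier (negative) lookup.
 */
STATIC int
example_versioned_insert(
	xfs_dqhash_t	*h,
	xfs_dquot_t	*dqp)
{
	uint		version;

	XFS_DQ_HASH_LOCK(h);
	version = h->qh_version;	/* remember the chain version */
	XFS_DQ_HASH_UNLOCK(h);

	/* ... slow work (e.g. a disk read) with the chain unlocked ... */

	XFS_DQ_HASH_LOCK(h);
	if (version != h->qh_version) {
		/*
		 * The chain changed while unlocked: a duplicate of
		 * this id may have been inserted, so re-run the
		 * lookup before inserting (as dqget does above).
		 */
	}
	XQM_HASHLIST_INSERT(h, dqp);
	XFS_DQ_HASH_UNLOCK(h);
	return (0);
}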
1148
1149
1150/*
1151 * Release a reference to the dquot (decrement ref-count)
1152 * and unlock it. If there is a group quota attached to this
1153 * dquot, carefully release that too without tripping over
1154 * deadlocks'n'stuff.
1155 */
1156void
1157xfs_qm_dqput(
1158 xfs_dquot_t *dqp)
1159{
1160 xfs_dquot_t *gdqp;
1161
1162 ASSERT(dqp->q_nrefs > 0);
1163 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1164 xfs_dqtrace_entry(dqp, "DQPUT");
1165
1166 if (dqp->q_nrefs != 1) {
1167 dqp->q_nrefs--;
1168 xfs_dqunlock(dqp);
1169 return;
1170 }
1171
1172 /*
1173 * drop the dqlock and acquire the freelist and dqlock
1174 * in the right order; but try to get it out-of-order first
1175 */
1176 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
1177 xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT");
1178 xfs_dqunlock(dqp);
1179 xfs_qm_freelist_lock(xfs_Gqm);
1180 xfs_dqlock(dqp);
1181 }
1182
1183 while (1) {
1184 gdqp = NULL;
1185
1186 /* We can't depend on nrefs being == 1 here */
1187 if (--dqp->q_nrefs == 0) {
1188 xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST");
1189 /*
1190 * insert at end of the freelist.
1191 */
1192 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1193
1194 /*
1195 * If we just added a udquot to the freelist, then
1196 * we want to release the gdquot reference that
1197 * it (probably) has. Otherwise it'll keep the
1198 * gdquot from getting reclaimed.
1199 */
1200 if ((gdqp = dqp->q_gdquot)) {
1201 /*
1202 * Avoid a recursive dqput call
1203 */
1204 xfs_dqlock(gdqp);
1205 dqp->q_gdquot = NULL;
1206 }
1207
1208 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1209 "@@@@@++ Free list (after append) @@@@@+");
1210 */
1211 }
1212 xfs_dqunlock(dqp);
1213
1214 /*
1215 * If we had a group quota inside the user quota as a hint,
1216 * release it now.
1217 */
1218 if (! gdqp)
1219 break;
1220 dqp = gdqp;
1221 }
1222 xfs_qm_freelist_unlock(xfs_Gqm);
1223}
1224
1225/*
1226 * Release a dquot. Flush it if dirty, then dqput() it.
1227 * dquot must not be locked.
1228 */
1229void
1230xfs_qm_dqrele(
1231 xfs_dquot_t *dqp)
1232{
1233 ASSERT(dqp);
1234 xfs_dqtrace_entry(dqp, "DQRELE");
1235
1236 xfs_dqlock(dqp);
1237 /*
1238 * We don't care to flush it if the dquot is dirty here.
1239 * That will create stutters that we want to avoid.
1240 * Instead we do a delayed write when we try to reclaim
1241 * a dirty dquot. Also xfs_sync will take part of the burden...
1242 */
1243 xfs_qm_dqput(dqp);
1244}
1245
1246
1247/*
1248 * Write a modified dquot to disk.
1249 * The dquot must be locked, and the flush lock taken by the caller.
1250 * The flush lock will not be unlocked until the dquot reaches the disk,
1251 * but the dquot is free to be unlocked and modified by the caller
1252 * in the interim. Dquot is still locked on return. This behavior is
1253 * identical to that of inodes.
1254 */
1255int
1256xfs_qm_dqflush(
1257 xfs_dquot_t *dqp,
1258 uint flags)
1259{
1260 xfs_mount_t *mp;
1261 xfs_buf_t *bp;
1262 xfs_disk_dquot_t *ddqp;
1263 int error;
1264 SPLDECL(s);
1265
1266 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1267 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
1268 xfs_dqtrace_entry(dqp, "DQFLUSH");
1269
1270 /*
1271 * If not dirty, nada.
1272 */
1273 if (!XFS_DQ_IS_DIRTY(dqp)) {
1274 xfs_dqfunlock(dqp);
1275 return (0);
1276 }
1277
1278 /*
1279	 * Can't flush a pinned dquot. Wait for it.
1280 */
1281 xfs_qm_dqunpin_wait(dqp);
1282
1283 /*
1284 * This may have been unpinned because the filesystem is shutting
1285 * down forcibly. If that's the case we must not write this dquot
1286 * to disk, because the log record didn't make it to disk!
1287 */
1288 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
1289 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1290 xfs_dqfunlock(dqp);
1291 return XFS_ERROR(EIO);
1292 }
1293
1294 /*
1295 * Get the buffer containing the on-disk dquot
1296 * We don't need a transaction envelope because we know that the
1297	 * ondisk dquot has already been allocated.
1298 */
1299 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
1300 xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
1301 ASSERT(error != ENOENT);
1302 /*
1303 * Quotas could have gotten turned off (ESRCH)
1304 */
1305 xfs_dqfunlock(dqp);
1306 return (error);
1307 }
1308
1309 if (xfs_qm_dqcheck(&dqp->q_core, INT_GET(ddqp->d_id, ARCH_CONVERT), 0, XFS_QMOPT_DOWARN,
1310 "dqflush (incore copy)")) {
1311 xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
1312 return XFS_ERROR(EIO);
1313 }
1314
1315 /* This is the only portion of data that needs to persist */
1316 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));
1317
1318 /*
1319 * Clear the dirty field and remember the flush lsn for later use.
1320 */
1321 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1322 mp = dqp->q_mount;
1323
1324 /* lsn is 64 bits */
1325 AIL_LOCK(mp, s);
1326 dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
1327 AIL_UNLOCK(mp, s);
1328
1329 /*
1330 * Attach an iodone routine so that we can remove this dquot from the
1331 * AIL and release the flush lock once the dquot is synced to disk.
1332 */
1333 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
1334 xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
1335 /*
1336 * If the buffer is pinned then push on the log so we won't
1337 * get stuck waiting in the write for too long.
1338 */
1339 if (XFS_BUF_ISPINNED(bp)) {
1340 xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
1341 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
1342 }
1343
1344 if (flags & XFS_QMOPT_DELWRI) {
1345 xfs_bdwrite(mp, bp);
1346 } else if (flags & XFS_QMOPT_ASYNC) {
1347 xfs_bawrite(mp, bp);
1348 } else {
1349 error = xfs_bwrite(mp, bp);
1350 }
1351 xfs_dqtrace_entry(dqp, "DQFLUSH END");
1352 /*
1353 * dqp is still locked, but caller is free to unlock it now.
1354 */
1355 return (error);
1356
1357}
1358
1359/*
1360 * This is the dquot flushing I/O completion routine. It is called
1361 * from interrupt level when the buffer containing the dquot is
1362 * flushed to disk. It is responsible for removing the dquot logitem
1363 * from the AIL if it has not been re-logged, and unlocking the dquot's
1364 * flush lock. This behavior is very similar to that of inodes.
1365 */
1366/*ARGSUSED*/
1367STATIC void
1368xfs_qm_dqflush_done(
1369 xfs_buf_t *bp,
1370 xfs_dq_logitem_t *qip)
1371{
1372 xfs_dquot_t *dqp;
1373 SPLDECL(s);
1374
1375 dqp = qip->qli_dquot;
1376
1377 /*
1378 * We only want to pull the item from the AIL if its
1379 * location in the log has not changed since we started the flush.
1380 * Thus, we only bother if the dquot's lsn has
1381 * not changed. First we check the lsn outside the lock
1382 * since it's cheaper, and then we recheck while
1383 * holding the lock before removing the dquot from the AIL.
1384 */
1385 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1386 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1387
1388 AIL_LOCK(dqp->q_mount, s);
1389 /*
1390 * xfs_trans_delete_ail() drops the AIL lock.
1391 */
1392 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1393 xfs_trans_delete_ail(dqp->q_mount,
1394 (xfs_log_item_t*)qip, s);
1395 else
1396 AIL_UNLOCK(dqp->q_mount, s);
1397 }
1398
1399 /*
1400 * Release the dq's flush lock since we're done with it.
1401 */
1402 xfs_dqfunlock(dqp);
1403}
1404
1405
1406int
1407xfs_qm_dqflock_nowait(
1408 xfs_dquot_t *dqp)
1409{
1410 int locked;
1411
1412 locked = cpsema(&((dqp)->q_flock));
1413
1414 /* XXX ifdef these out */
1415 if (locked)
1416 (dqp)->dq_flags |= XFS_DQ_FLOCKED;
1417 return (locked);
1418}
1419
1420
1421int
1422xfs_qm_dqlock_nowait(
1423 xfs_dquot_t *dqp)
1424{
1425 return (mutex_trylock(&((dqp)->q_qlock)));
1426}
1427
1428void
1429xfs_dqlock(
1430 xfs_dquot_t *dqp)
1431{
1432 mutex_lock(&(dqp->q_qlock), PINOD);
1433}
1434
1435void
1436xfs_dqunlock(
1437 xfs_dquot_t *dqp)
1438{
1439 mutex_unlock(&(dqp->q_qlock));
1440 if (dqp->q_logitem.qli_dquot == dqp) {
1441 /* Once was dqp->q_mount, but might just have been cleared */
1442 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
1443 (xfs_log_item_t*)&(dqp->q_logitem));
1444 }
1445}
1446
1447
1448void
1449xfs_dqunlock_nonotify(
1450 xfs_dquot_t *dqp)
1451{
1452 mutex_unlock(&(dqp->q_qlock));
1453}
1454
1455void
1456xfs_dqlock2(
1457 xfs_dquot_t *d1,
1458 xfs_dquot_t *d2)
1459{
1460 if (d1 && d2) {
1461 ASSERT(d1 != d2);
1462 if (INT_GET(d1->q_core.d_id, ARCH_CONVERT) > INT_GET(d2->q_core.d_id, ARCH_CONVERT)) {
1463 xfs_dqlock(d2);
1464 xfs_dqlock(d1);
1465 } else {
1466 xfs_dqlock(d1);
1467 xfs_dqlock(d2);
1468 }
1469 } else {
1470 if (d1) {
1471 xfs_dqlock(d1);
1472 } else if (d2) {
1473 xfs_dqlock(d2);
1474 }
1475 }
1476}
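
Since xfs_dqlock2() always acquires the pair in ascending-id order, two racing callers cannot deadlock on the same two dquots. A hedged usage sketch follows: the transfer helper and its bookkeeping are hypothetical, while the fields and locking calls are from this commit.

/*
 * Hypothetical usage sketch: move part of a block reservation between
 * two dquots of the same type without risking an ABBA deadlock.
 */
STATIC void
example_transfer_res_blocks(
	xfs_dquot_t	*from,
	xfs_dquot_t	*to,
	xfs_qcnt_t	nblks)
{
	xfs_dqlock2(from, to);		/* both locked, ascending-id order */
	from->q_res_bcount -= nblks;
	to->q_res_bcount += nblks;
	xfs_dqunlock(to);
	xfs_dqunlock(from);
}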
1477
1478
1479/*
1480 * Take a dquot out of the mount's dqlist as well as the hashlist.
1481 * This is called via unmount as well as quotaoff, and the purge
1482 * will always succeed unless there are soft (temp) references
1483 * outstanding.
1484 *
1485 * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
1486 * that we're returning! XXXsup - not cool.
1487 */
1488/* ARGSUSED */
1489int
1490xfs_qm_dqpurge(
1491 xfs_dquot_t *dqp,
1492 uint flags)
1493{
1494 xfs_dqhash_t *thishash;
1495 xfs_mount_t *mp;
1496
1497 mp = dqp->q_mount;
1498
1499 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
1500 ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
1501
1502 xfs_dqlock(dqp);
1503 /*
1504 * We really can't afford to purge a dquot that is
1505 * referenced, because these are hard refs.
1506	 * It shouldn't happen in general because we went through _all_ inodes in
1507 * dqrele_all_inodes before calling this and didn't let the mountlock go.
1508 * However it is possible that we have dquots with temporary
1509 * references that are not attached to an inode. e.g. see xfs_setattr().
1510 */
1511 if (dqp->q_nrefs != 0) {
1512 xfs_dqunlock(dqp);
1513 XFS_DQ_HASH_UNLOCK(dqp->q_hash);
1514 return (1);
1515 }
1516
1517 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
1518
1519 /*
1520 * If we're turning off quotas, we have to make sure that, for
1521 * example, we don't delete quota disk blocks while dquots are
1522 * in the process of getting written to those disk blocks.
1523 * This dquot might well be on AIL, and we can't leave it there
1524 * if we're turning off quotas. Basically, we need this flush
1525 * lock, and are willing to block on it.
1526 */
1527 if (! xfs_qm_dqflock_nowait(dqp)) {
1528 /*
1529 * Block on the flush lock after nudging dquot buffer,
1530 * if it is incore.
1531 */
1532 xfs_qm_dqflock_pushbuf_wait(dqp);
1533 }
1534
1535 /*
1536	 * XXX If we're turning this type of quota off, we don't care
1537 * about the dirty metadata sitting in this dquot. OTOH, if
1538 * we're unmounting, we do care, so we flush it and wait.
1539 */
1540 if (XFS_DQ_IS_DIRTY(dqp)) {
1541 xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
1542 /* dqflush unlocks dqflock */
1543 /*
1544 * Given that dqpurge is a very rare occurrence, it is OK
1545 * that we're holding the hashlist and mplist locks
1546 * across the disk write. But, ... XXXsup
1547 *
1548 * We don't care about getting disk errors here. We need
1549 * to purge this dquot anyway, so we go ahead regardless.
1550 */
1551 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
1552 xfs_dqflock(dqp);
1553 }
1554 ASSERT(dqp->q_pincount == 0);
1555 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1556 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1557
1558 thishash = dqp->q_hash;
1559 XQM_HASHLIST_REMOVE(thishash, dqp);
1560 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
1561 /*
1562 * XXX Move this to the front of the freelist, if we can get the
1563 * freelist lock.
1564 */
1565 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
1566
1567 dqp->q_mount = NULL;
1568 dqp->q_hash = NULL;
1569 dqp->dq_flags = XFS_DQ_INACTIVE;
1570 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1571 xfs_dqfunlock(dqp);
1572 xfs_dqunlock(dqp);
1573 XFS_DQ_HASH_UNLOCK(thishash);
1574 return (0);
1575}
1576
1577
1578#ifdef QUOTADEBUG
1579void
1580xfs_qm_dqprint(xfs_dquot_t *dqp)
1581{
1582 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------");
1583 cmn_err(CE_DEBUG, "---- dquotID = %d",
1584 (int)INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
1585 cmn_err(CE_DEBUG, "---- type = %s",
1586 XFS_QM_ISUDQ(dqp) ? "USR" : "GRP");
1587 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount);
1588 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno);
1589 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1590 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)",
1591 INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT),
1592 (int) INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT));
1593 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)",
1594 INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT),
1595 (int)INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT));
1596 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)",
1597 INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT),
1598 (int)INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT));
1599 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)",
1600 INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT),
1601 (int)INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT));
1602 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)",
1603 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
1604 (int)INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
1605 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)",
1606 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
1607 (int)INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
1608 cmn_err(CE_DEBUG, "---- btimer = %d",
1609 (int)INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT));
1610 cmn_err(CE_DEBUG, "---- itimer = %d",
1611 (int)INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT));
1612 cmn_err(CE_DEBUG, "---------------------------");
1613}
1614#endif
1615
1616/*
1617 * Give the buffer a little push if it is incore and
1618 * wait on the flush lock.
1619 */
1620void
1621xfs_qm_dqflock_pushbuf_wait(
1622 xfs_dquot_t *dqp)
1623{
1624 xfs_buf_t *bp;
1625
1626 /*
1627	 * Check to see if the dquot has been flushed as a delayed
1628 * write. If so, grab its buffer and send it
1629 * out immediately. We'll be able to acquire
1630 * the flush lock when the I/O completes.
1631 */
1632 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
1633 XFS_QI_DQCHUNKLEN(dqp->q_mount),
1634 XFS_INCORE_TRYLOCK);
1635 if (bp != NULL) {
1636 if (XFS_BUF_ISDELAYWRITE(bp)) {
1637 if (XFS_BUF_ISPINNED(bp)) {
1638 xfs_log_force(dqp->q_mount,
1639 (xfs_lsn_t)0,
1640 XFS_LOG_FORCE);
1641 }
1642 xfs_bawrite(dqp->q_mount, bp);
1643 } else {
1644 xfs_buf_relse(bp);
1645 }
1646 }
1647 xfs_dqflock(dqp);
1648}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
new file mode 100644
index 000000000000..0c3fe3175baa
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -0,0 +1,224 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DQUOT_H__
33#define __XFS_DQUOT_H__
34
35/*
36 * Dquots are structures that hold quota information about a user or a group,
37 * much like inodes are for files. In fact, dquots share many characteristics
38 * with inodes. However, dquots can also be a centralized resource, relative
39 * to a collection of inodes. In this respect, dquots share some characteristics
40 * of the superblock.
41 * XFS dquots exploit both of those in their algorithms. They make every attempt
42 * to not be a bottleneck when quotas are on and have minimal impact, if any,
43 * when quotas are off.
44 */
45
46/*
47 * The hash chain headers (hash buckets)
48 */
49typedef struct xfs_dqhash {
50 struct xfs_dquot *qh_next;
51 mutex_t qh_lock;
52 uint qh_version; /* ever increasing version */
53 uint qh_nelems; /* number of dquots on the list */
54} xfs_dqhash_t;
55
56typedef struct xfs_dqlink {
57 struct xfs_dquot *ql_next; /* forward link */
58 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
59} xfs_dqlink_t;
60
61struct xfs_mount;
62struct xfs_trans;
63
64/*
65 * This is the marker which is designed to occupy the first few
66 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
67 * must come first.
68 * This serves as the marker ("sentinel") when we have to restart list
69 * iterations because of locking considerations.
70 */
71typedef struct xfs_dqmarker {
72 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
73 struct xfs_dquot*dqm_flprev;
74 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
75 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
76 uint dqm_flags; /* various flags (XFS_DQ_*) */
77} xfs_dqmarker_t;
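
As a hedged sketch of the restart technique this marker enables -- the walk helper and splice primitives below are hypothetical stand-ins, and only the types come from this header -- a walker parks the marker after the current element, drops the list lock, and later resumes from the marker instead of rescanning from the head:

/*
 * Illustrative only: freelist_lock()/freelist_unlock() and the splice
 * helpers are hypothetical placeholders for the real freelist code.
 * The cast below relies on the marker occupying the first bytes of
 * the xfs_dquot structure, as described above.
 */
extern void freelist_lock(void);
extern void freelist_unlock(void);
extern void marker_splice_after(struct xfs_dquot *dqp, xfs_dqmarker_t *m);
extern void marker_unsplice(xfs_dqmarker_t *m);

static void
example_walk_with_marker(struct xfs_dquot *head)
{
	xfs_dqmarker_t		mark;
	struct xfs_dquot	*dqp;

	freelist_lock();
	dqp = ((xfs_dqmarker_t *)head)->dqm_flnext;	/* first element */
	while (dqp != head) {
		marker_splice_after(dqp, &mark);  /* park marker after dqp */
		freelist_unlock();
		/* ... per-dquot work that may sleep ... */
		freelist_lock();
		dqp = mark.dqm_flnext;		/* resume past the marker */
		marker_unsplice(&mark);
	}
	freelist_unlock();
}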
78
79/*
80 * The incore dquot structure
81 */
82typedef struct xfs_dquot {
83 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */
84 xfs_dqhash_t *q_hash; /* the hashchain header */
85 struct xfs_mount*q_mount; /* filesystem this relates to */
86 struct xfs_trans*q_transp; /* trans this belongs to currently */
87 uint q_nrefs; /* # active refs from inodes */
88 xfs_daddr_t q_blkno; /* blkno of dquot buffer */
89 int q_bufoffset; /* off of dq in buffer (# dquots) */
90 xfs_fileoff_t q_fileoffset; /* offset in quotas file */
91
92 struct xfs_dquot*q_gdquot; /* group dquot, hint only */
93 xfs_disk_dquot_t q_core; /* actual usage & quotas */
94 xfs_dq_logitem_t q_logitem; /* dquot log item */
95 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
96 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
97 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
98 mutex_t q_qlock; /* quota lock */
99 sema_t q_flock; /* flush lock */
100 uint q_pincount; /* pin count for this dquot */
101 sv_t q_pinwait; /* sync var for pinning */
102#ifdef XFS_DQUOT_TRACE
103 struct ktrace *q_trace; /* trace header structure */
104#endif
105} xfs_dquot_t;
106
107
108#define dq_flnext q_lists.dqm_flnext
109#define dq_flprev q_lists.dqm_flprev
110#define dq_mplist q_lists.dqm_mplist
111#define dq_hashlist q_lists.dqm_hashlist
112#define dq_flags q_lists.dqm_flags
113
114#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
115
116/*
117 * Quota Accounting flags
118 */
119#define XFS_ALL_QUOTA_ACCT (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT)
120#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD)
121#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD)
122#define XFS_ALL_QUOTA_ACTV (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
123#define XFS_ALL_QUOTA_ACCT_ENFD (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
124 XFS_GQUOTA_ACCT|XFS_GQUOTA_ENFD)
125
126#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
127#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
128#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
129
130/*
131 * Quota Limit Enforcement flags
132 */
133#define XFS_IS_QUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
134#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
135#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
136
137#ifdef DEBUG
138static inline int
139XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
140{
141 if (mutex_trylock(&dqp->q_qlock)) {
142 mutex_unlock(&dqp->q_qlock);
143 return 0;
144 }
145 return 1;
146}
147#endif
148
149
150/*
151 * The following three routines simply manage the q_flock
152 * semaphore embedded in the dquot. This semaphore synchronizes
153 * processes attempting to flush the in-core dquot back to disk.
154 */
155#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\
156 (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
157#define xfs_dqfunlock(dqp) { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
158 vsema(&((dqp)->q_flock)); \
159 (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
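
A hedged sketch of the flush protocol these macros support, mirroring the xfs_qm_dqflush() callers in this commit (fragment only; dqp and error are assumed to be declared by the surrounding function):

/* Dquot lock first (see the LOCK ORDER comment), then the flush lock. */
xfs_dqlock(dqp);
if (XFS_DQ_IS_DIRTY(dqp)) {
	xfs_dqflock(dqp);	/* may sleep until a prior flush finishes */
	error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
	/* the flush lock is released once the write completes */
}
xfs_dqunlock(dqp);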
160
161#define XFS_DQ_PINLOCK(dqp) mutex_spinlock( \
162 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock))
163#define XFS_DQ_PINUNLOCK(dqp, s) mutex_spinunlock( \
164 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
165
166#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
167#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
168#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
169#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
170#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
171#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
172 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
173 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
174
175#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
176 (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
177 (XFS_IS_GQUOTA_ON((d)->q_mount))))
178
179#ifdef XFS_DQUOT_TRACE
180/*
181 * Dquot Tracing stuff.
182 */
183#define DQUOT_TRACE_SIZE 64
184#define DQUOT_KTRACE_ENTRY 1
185
186extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func,
187 void *, xfs_inode_t *);
188#define xfs_dqtrace_entry_ino(a,b,ip) \
189 __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip))
190#define xfs_dqtrace_entry(a,b) \
191 __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL)
192#else
193#define xfs_dqtrace_entry(a,b)
194#define xfs_dqtrace_entry_ino(a,b,ip)
195#endif
196
197#ifdef QUOTADEBUG
198extern void xfs_qm_dqprint(xfs_dquot_t *);
199#else
200#define xfs_qm_dqprint(a)
201#endif
202
203extern void xfs_qm_dqdestroy(xfs_dquot_t *);
204extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
205extern int xfs_qm_dqpurge(xfs_dquot_t *, uint);
206extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
207extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
208extern int xfs_qm_dqflock_nowait(xfs_dquot_t *);
209extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
210extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
211 xfs_disk_dquot_t *);
212extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
213 xfs_disk_dquot_t *);
214extern int xfs_qm_dqwarn(xfs_disk_dquot_t *, uint);
215extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
216 xfs_dqid_t, uint, uint, xfs_dquot_t **);
217extern void xfs_qm_dqput(xfs_dquot_t *);
218extern void xfs_qm_dqrele(xfs_dquot_t *);
219extern void xfs_dqlock(xfs_dquot_t *);
220extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
221extern void xfs_dqunlock(xfs_dquot_t *);
222extern void xfs_dqunlock_nonotify(xfs_dquot_t *);
223
224#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
new file mode 100644
index 000000000000..a5425ee6e7bd
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -0,0 +1,715 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_trans_priv.h"
68
69#include "xfs_qm.h"
70
71
72/*
73 * returns the number of iovecs needed to log the given dquot item.
74 */
75/* ARGSUSED */
76STATIC uint
77xfs_qm_dquot_logitem_size(
78 xfs_dq_logitem_t *logitem)
79{
80 /*
81 * we need only two iovecs, one for the format, one for the real thing
82 */
83 return (2);
84}
85
86/*
87 * fills in the vector of log iovecs for the given dquot log item.
88 */
89STATIC void
90xfs_qm_dquot_logitem_format(
91 xfs_dq_logitem_t *logitem,
92 xfs_log_iovec_t *logvec)
93{
94 ASSERT(logitem);
95 ASSERT(logitem->qli_dquot);
96
97 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
98 logvec->i_len = sizeof(xfs_dq_logformat_t);
99 logvec++;
100 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
101 logvec->i_len = sizeof(xfs_disk_dquot_t);
102
103 ASSERT(2 == logitem->qli_item.li_desc->lid_size);
104 logitem->qli_format.qlf_size = 2;
105
106}
107
108/*
109 * Increment the pin count of the given dquot.
110 * This value is protected by the pinlock spinlock in the quotainfo structure.
111 */
112STATIC void
113xfs_qm_dquot_logitem_pin(
114 xfs_dq_logitem_t *logitem)
115{
116 unsigned long s;
117 xfs_dquot_t *dqp;
118
119 dqp = logitem->qli_dquot;
120 ASSERT(XFS_DQ_IS_LOCKED(dqp));
121 s = XFS_DQ_PINLOCK(dqp);
122 dqp->q_pincount++;
123 XFS_DQ_PINUNLOCK(dqp, s);
124}
125
126/*
127 * Decrement the pin count of the given dquot, and wake up
128 * anyone in xfs_qm_dqunpin_wait() if the count goes to 0. The
129 * dquot must have been previously pinned via xfs_qm_dquot_logitem_pin().
130 */
131/* ARGSUSED */
132STATIC void
133xfs_qm_dquot_logitem_unpin(
134 xfs_dq_logitem_t *logitem,
135 int stale)
136{
137 unsigned long s;
138 xfs_dquot_t *dqp;
139
140 dqp = logitem->qli_dquot;
141 ASSERT(dqp->q_pincount > 0);
142 s = XFS_DQ_PINLOCK(dqp);
143 dqp->q_pincount--;
144 if (dqp->q_pincount == 0) {
145 sv_broadcast(&dqp->q_pinwait);
146 }
147 XFS_DQ_PINUNLOCK(dqp, s);
148}
149
150/* ARGSUSED */
151STATIC void
152xfs_qm_dquot_logitem_unpin_remove(
153 xfs_dq_logitem_t *logitem,
154 xfs_trans_t *tp)
155{
156 xfs_qm_dquot_logitem_unpin(logitem, 0);
157}
158
159/*
160 * Given the logitem, this writes the corresponding dquot entry to disk
161 * asynchronously. This is called with the dquot entry securely locked;
162 * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
163 * at the end.
164 */
165STATIC void
166xfs_qm_dquot_logitem_push(
167 xfs_dq_logitem_t *logitem)
168{
169 xfs_dquot_t *dqp;
170
171 dqp = logitem->qli_dquot;
172
173 ASSERT(XFS_DQ_IS_LOCKED(dqp));
174 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
175
176 /*
177 * Since we were able to lock the dquot's flush lock and
178 * we found it on the AIL, the dquot must be dirty. This
179 * is because the dquot is removed from the AIL while still
180	 * holding the flush lock in xfs_qm_dqflush_done(). Thus, if
181 * we found it in the AIL and were able to obtain the flush
182 * lock without sleeping, then there must not have been
183 * anyone in the process of flushing the dquot.
184 */
185 xfs_qm_dqflush(dqp, XFS_B_DELWRI);
186 xfs_dqunlock(dqp);
187}
188
189/*ARGSUSED*/
190STATIC xfs_lsn_t
191xfs_qm_dquot_logitem_committed(
192 xfs_dq_logitem_t *l,
193 xfs_lsn_t lsn)
194{
195 /*
196 * We always re-log the entire dquot when it becomes dirty,
197 * so, the latest copy _is_ the only one that matters.
198 */
199 return (lsn);
200}
201
202
203/*
204 * This is called to wait for the given dquot to be unpinned.
205 * Most of these pin/unpin routines are plagiarized from inode code.
206 */
207void
208xfs_qm_dqunpin_wait(
209 xfs_dquot_t *dqp)
210{
211 SPLDECL(s);
212
213 ASSERT(XFS_DQ_IS_LOCKED(dqp));
214 if (dqp->q_pincount == 0) {
215 return;
216 }
217
218 /*
219 * Give the log a push so we don't wait here too long.
220 */
221 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
222 s = XFS_DQ_PINLOCK(dqp);
223 if (dqp->q_pincount == 0) {
224 XFS_DQ_PINUNLOCK(dqp, s);
225 return;
226 }
227 sv_wait(&(dqp->q_pinwait), PINOD,
228 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
229}
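/*
 * A minimal userspace sketch of the pin/unpin/unpin-wait handshake
 * implemented above, with a pthread mutex and condition variable
 * standing in for the XQM pinlock and the q_pinwait sync variable.
 * The names pin_lock, pin_cond and pin_count are illustrative, not
 * kernel symbols, and the log-force optimization is omitted.
 */
#include <pthread.h>

static pthread_mutex_t pin_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  pin_cond = PTHREAD_COND_INITIALIZER;
static int             pin_count;

static void pin(void)
{
    pthread_mutex_lock(&pin_lock);
    pin_count++;
    pthread_mutex_unlock(&pin_lock);
}

static void unpin(void)
{
    pthread_mutex_lock(&pin_lock);
    if (--pin_count == 0)
        pthread_cond_broadcast(&pin_cond);       /* like sv_broadcast() */
    pthread_mutex_unlock(&pin_lock);
}

static void wait_unpinned(void)
{
    pthread_mutex_lock(&pin_lock);
    while (pin_count > 0)                        /* like sv_wait(): sleep */
        pthread_cond_wait(&pin_cond, &pin_lock); /* with the lock dropped */
    pthread_mutex_unlock(&pin_lock);
}

int main(void)
{
    pin();
    unpin();            /* count hits zero, broadcast fires */
    wait_unpinned();    /* returns immediately: nothing pinned */
    return 0;
}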
230
231/*
232 * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
233 * the dquot is locked by us, but the flush lock isn't. So, here we are
234 * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
235 * If so, we want to push it out to help us take this item off the AIL as soon
236 * as possible.
237 *
238 * We must not be holding the AIL_LOCK at this point. Calling incore() to
239 * search the buffercache can be a time consuming thing, and AIL_LOCK is a
240 * spinlock.
241 */
242STATIC void
243xfs_qm_dquot_logitem_pushbuf(
244 xfs_dq_logitem_t *qip)
245{
246 xfs_dquot_t *dqp;
247 xfs_mount_t *mp;
248 xfs_buf_t *bp;
249 uint dopush;
250
251 dqp = qip->qli_dquot;
252 ASSERT(XFS_DQ_IS_LOCKED(dqp));
253
254 /*
255 * The qli_pushbuf_flag keeps others from
256 * trying to duplicate our effort.
257 */
258 ASSERT(qip->qli_pushbuf_flag != 0);
259 ASSERT(qip->qli_push_owner == get_thread_id());
260
261 /*
262 * If the flush lock isn't held anymore, chances are that the
263 * dquot flush completed and the dquot was taken off the AIL.
264 * So, just get out.
265 */
266 if ((valusema(&(dqp->q_flock)) > 0) ||
267 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
268 qip->qli_pushbuf_flag = 0;
269 xfs_dqunlock(dqp);
270 return;
271 }
272 mp = dqp->q_mount;
273 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
274 XFS_QI_DQCHUNKLEN(mp),
275 XFS_INCORE_TRYLOCK);
276 if (bp != NULL) {
277 if (XFS_BUF_ISDELAYWRITE(bp)) {
278 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
279 (valusema(&(dqp->q_flock)) <= 0));
280 qip->qli_pushbuf_flag = 0;
281 xfs_dqunlock(dqp);
282
283 if (XFS_BUF_ISPINNED(bp)) {
284 xfs_log_force(mp, (xfs_lsn_t)0,
285 XFS_LOG_FORCE);
286 }
287 if (dopush) {
288#ifdef XFSRACEDEBUG
289 delay_for_intr();
290 delay(300);
291#endif
292 xfs_bawrite(mp, bp);
293 } else {
294 xfs_buf_relse(bp);
295 }
296 } else {
297 qip->qli_pushbuf_flag = 0;
298 xfs_dqunlock(dqp);
299 xfs_buf_relse(bp);
300 }
301 return;
302 }
303
304 qip->qli_pushbuf_flag = 0;
305 xfs_dqunlock(dqp);
306}
307
308/*
309 * This is called to attempt to lock the dquot associated with this
310 * dquot log item. Don't sleep on the dquot lock or the flush lock.
311 * If the flush lock is already held, indicating that the dquot has
312 * been or is in the process of being flushed, then see if we can
313 * find the dquot's buffer in the buffer cache without sleeping. If
314 * we can and it is marked delayed write, then we want to send it out.
315 * We delay doing so until the push routine, though, to avoid sleeping
316 * in any device strategy routines.
317 */
318STATIC uint
319xfs_qm_dquot_logitem_trylock(
320 xfs_dq_logitem_t *qip)
321{
322 xfs_dquot_t *dqp;
323 uint retval;
324
325 dqp = qip->qli_dquot;
326 if (dqp->q_pincount > 0)
327 return (XFS_ITEM_PINNED);
328
329 if (! xfs_qm_dqlock_nowait(dqp))
330 return (XFS_ITEM_LOCKED);
331
332 retval = XFS_ITEM_SUCCESS;
333 if (! xfs_qm_dqflock_nowait(dqp)) {
334 /*
335 * The dquot is already being flushed. It may have been
336 * flushed delayed write, however, and we don't want to
337 * get stuck waiting for that to complete. So, we want to check
338 * to see if we can lock the dquot's buffer without sleeping.
339 * If we can and it is marked for delayed write, then we
340 * hold it and send it out from the push routine. We don't
341 * want to do that now since we might sleep in the device
342 * strategy routine. We also don't want to grab the buffer lock
343 * here because we'd like not to call into the buffer cache
344 * while holding the AIL_LOCK.
345 * Make sure to only return PUSHBUF if we set pushbuf_flag
346 * ourselves. If someone else is doing it then we don't
347 * want to go to the push routine and duplicate their efforts.
348 */
349 if (qip->qli_pushbuf_flag == 0) {
350 qip->qli_pushbuf_flag = 1;
351 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
352#ifdef DEBUG
353 qip->qli_push_owner = get_thread_id();
354#endif
355 /*
356 * The dquot is left locked.
357 */
358 retval = XFS_ITEM_PUSHBUF;
359 } else {
360 retval = XFS_ITEM_FLUSHING;
361 xfs_dqunlock_nonotify(dqp);
362 }
363 }
364
365 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
366 return (retval);
367}
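/*
 * The trylock decision tree above collapses to a handful of outcomes.
 * A compact userspace rendering with pthread trylocks follows; the enum
 * values mirror the XFS_ITEM_* return codes, while the item struct and
 * field names are invented for illustration.
 */
#include <pthread.h>

enum { ITEM_SUCCESS, ITEM_PINNED, ITEM_LOCKED, ITEM_FLUSHING, ITEM_PUSHBUF };

struct item {
    pthread_mutex_t lock;          /* the dquot lock */
    pthread_mutex_t flush_lock;    /* the flush lock */
    int             pin_count;
    int             pushbuf_flag;
};

static int item_trylock(struct item *it)
{
    if (it->pin_count > 0)
        return ITEM_PINNED;                     /* pinned: hands off */
    if (pthread_mutex_trylock(&it->lock) != 0)
        return ITEM_LOCKED;                     /* somebody holds it */
    if (pthread_mutex_trylock(&it->flush_lock) != 0) {
        if (it->pushbuf_flag) {                 /* a push is under way */
            pthread_mutex_unlock(&it->lock);
            return ITEM_FLUSHING;
        }
        it->pushbuf_flag = 1;                   /* we'll push the buffer; */
        return ITEM_PUSHBUF;                    /* item stays locked */
    }
    return ITEM_SUCCESS;                        /* locked + flush-locked */
}

int main(void)
{
    struct item it = { PTHREAD_MUTEX_INITIALIZER,
                       PTHREAD_MUTEX_INITIALIZER, 0, 0 };
    return item_trylock(&it) != ITEM_SUCCESS;
}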
368
369
370/*
371 * Unlock the dquot associated with the log item.
372 * Clear the fields of the dquot and dquot log item that
373 * are specific to the current transaction. If the
374 * hold flag is set, do not unlock the dquot.
375 */
376STATIC void
377xfs_qm_dquot_logitem_unlock(
378 xfs_dq_logitem_t *ql)
379{
380 xfs_dquot_t *dqp;
381
382 ASSERT(ql != NULL);
383 dqp = ql->qli_dquot;
384 ASSERT(XFS_DQ_IS_LOCKED(dqp));
385
386 /*
387 * Clear the transaction pointer in the dquot
388 */
389 dqp->q_transp = NULL;
390
391 /*
392 * dquots are never 'held' from getting unlocked at the end of
393 * a transaction. Their locking and unlocking is hidden inside the
394 * transaction layer, within trans_commit. Hence, no LI_HOLD flag
395 * for the logitem.
396 */
397 xfs_dqunlock(dqp);
398}
399
400
401/*
402 * The transaction with the dquot locked has aborted. The dquot
403 * must not be dirty within the transaction. We simply unlock just
404 * as if the transaction had been cancelled.
405 */
406STATIC void
407xfs_qm_dquot_logitem_abort(
408 xfs_dq_logitem_t *ql)
409{
410 xfs_qm_dquot_logitem_unlock(ql);
411}
412
413/*
414 * This needs to stamp an LSN into the dquot, I think.
415 * RPCs that look at user dquots would then have to
416 * push on the dependency recorded in the dquot.
417 */
418/* ARGSUSED */
419STATIC void
420xfs_qm_dquot_logitem_committing(
421 xfs_dq_logitem_t *l,
422 xfs_lsn_t lsn)
423{
424 return;
425}
426
427
428/*
429 * This is the ops vector for dquots
430 */
431struct xfs_item_ops xfs_dquot_item_ops = {
432 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
433 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
434 xfs_qm_dquot_logitem_format,
435 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
436 .iop_unpin = (void(*)(xfs_log_item_t*, int))
437 xfs_qm_dquot_logitem_unpin,
438 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
439 xfs_qm_dquot_logitem_unpin_remove,
440 .iop_trylock = (uint(*)(xfs_log_item_t*))
441 xfs_qm_dquot_logitem_trylock,
442 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
443 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
444 xfs_qm_dquot_logitem_committed,
445 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
446 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
447 .iop_pushbuf = (void(*)(xfs_log_item_t*))
448 xfs_qm_dquot_logitem_pushbuf,
449 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
450 xfs_qm_dquot_logitem_committing
451};
452
453/*
454 * Initialize the dquot log item for a newly allocated dquot.
455 * The dquot isn't locked at this point, but it isn't on any of the lists
456 * either, so we don't care.
457 */
458void
459xfs_qm_dquot_logitem_init(
460 struct xfs_dquot *dqp)
461{
462 xfs_dq_logitem_t *lp;
463 lp = &dqp->q_logitem;
464
465 lp->qli_item.li_type = XFS_LI_DQUOT;
466 lp->qli_item.li_ops = &xfs_dquot_item_ops;
467 lp->qli_item.li_mountp = dqp->q_mount;
468 lp->qli_dquot = dqp;
469 lp->qli_format.qlf_type = XFS_LI_DQUOT;
470 lp->qli_format.qlf_id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
471 lp->qli_format.qlf_blkno = dqp->q_blkno;
472 lp->qli_format.qlf_len = 1;
473 /*
474 * This is just the offset of this dquot within its buffer
475 * (which is currently 1 FSB and probably won't change).
476 * Hence 32 bits for this offset should be just fine.
477 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
478 * here, and recompute it at recovery time.
479 */
480 lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
481}
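/*
 * A small sketch of the qlf_boffset comment above: the logged value is
 * a byte offset within a one-FSB buffer, so 32 bits suffice, and the
 * alternative index encoding is recoverable with one multiply.  The
 * dquot-block size used here is illustrative, not the on-disk value.
 */
#include <assert.h>
#include <stdint.h>

#define DQBLK_SIZE 136u    /* illustrative stand-in for sizeof(xfs_dqblk_t) */

int main(void)
{
    uint32_t boffset = 3 * DQBLK_SIZE;       /* what the code logs today */
    uint32_t index   = boffset / DQBLK_SIZE; /* the alternative encoding */

    /* Recovery could rebuild the byte offset from the index: */
    assert(index * DQBLK_SIZE == boffset);
    return 0;
}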
482
483/*------------------ QUOTAOFF LOG ITEMS -------------------*/
484
485/*
486 * This returns the number of iovecs needed to log the given quotaoff item.
487 * We only need 1 iovec for a quotaoff item.  It just logs the
488 * quotaoff_log_format structure.
489 */
490/*ARGSUSED*/
491STATIC uint
492xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
493{
494 return (1);
495}
496
497/*
498 * This is called to fill in the vector of log iovecs for the
499 * given quotaoff log item. We use only 1 iovec, and we point that
500 * at the quotaoff_log_format structure embedded in the quotaoff item.
501 * It is at this point that we assert that the item really
502 * carries the XFS_LI_QUOTAOFF type.
503 */
504STATIC void
505xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
506 xfs_log_iovec_t *log_vector)
507{
508 ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
509
510 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
511 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
512 qf->qql_format.qf_size = 1;
513}
514
515
516/*
517 * Pinning has no meaning for a quotaoff item, so just return.
518 */
519/*ARGSUSED*/
520STATIC void
521xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
522{
523 return;
524}
525
526
527/*
528 * Since pinning has no meaning for a quotaoff item, neither
529 * does unpinning.
530 */
531/*ARGSUSED*/
532STATIC void
533xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
534{
535 return;
536}
537
538/*ARGSUSED*/
539STATIC void
540xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
541{
542 return;
543}
544
545/*
546 * Quotaoff items have no locking; return XFS_ITEM_LOCKED so the AIL leaves them alone.
547 */
548/*ARGSUSED*/
549STATIC uint
550xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
551{
552 return XFS_ITEM_LOCKED;
553}
554
555/*
556 * Quotaoff items have no locking, so there is nothing
557 * for the transaction code to unlock here.
558 */
559/*ARGSUSED*/
560STATIC void
561xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
562{
563 return;
564}
565
566/*
567 * The quotaoff-start-item is logged only once and cannot be moved in the log,
568 * so simply return the lsn at which it's been logged.
569 */
570/*ARGSUSED*/
571STATIC xfs_lsn_t
572xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
573{
574 return (lsn);
575}
576
577/*
578 * The transaction of which this QUOTAOFF is a part has been aborted.
579 * Just clean up after ourselves.
580 * This should never happen in the case of qoffend logitems, should it? XXX
581 */
582STATIC void
583xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
584{
585 kmem_free(qf, sizeof(xfs_qoff_logitem_t));
586}
587
588/*
589 * There isn't much you can do to push on a quotaoff item.  It is simply
590 * stuck waiting for the log to be flushed to disk.
591 */
592/*ARGSUSED*/
593STATIC void
594xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
595{
596 return;
597}
598
599
600/*ARGSUSED*/
601STATIC xfs_lsn_t
602xfs_qm_qoffend_logitem_committed(
603 xfs_qoff_logitem_t *qfe,
604 xfs_lsn_t lsn)
605{
606 xfs_qoff_logitem_t *qfs;
607 SPLDECL(s);
608
609 qfs = qfe->qql_start_lip;
610 AIL_LOCK(qfs->qql_item.li_mountp,s);
611 /*
612 * Delete the qoff-start logitem from the AIL.
613 * xfs_trans_delete_ail() drops the AIL lock.
614 */
615 xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s);
616 kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
617 kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
618 return (xfs_lsn_t)-1;
619}
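/*
 * A minimal userspace analogue of the start/end pairing above: the end
 * item carries a pointer back to the start item, and when the end item
 * commits, both records are torn down together.  Types and names are
 * invented for illustration.
 */
#include <stdlib.h>

struct qoff_item {
    struct qoff_item *start;    /* set only in the end item */
    unsigned int      flags;
};

static struct qoff_item *qoff_item_alloc(struct qoff_item *start,
                                         unsigned int flags)
{
    struct qoff_item *qf = calloc(1, sizeof(*qf));

    if (!qf)
        abort();
    qf->start = start;          /* NULL for the start item itself */
    qf->flags = flags;
    return qf;
}

static void qoff_end_committed(struct qoff_item *end)
{
    free(end->start);           /* the start item can go now... */
    free(end);                  /* ...and so can the end item */
}

int main(void)
{
    struct qoff_item *start = qoff_item_alloc(NULL, 0x1);
    struct qoff_item *end   = qoff_item_alloc(start, 0x1);

    qoff_end_committed(end);    /* frees both halves of the pair */
    return 0;
}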
620
621/*
622 * XXX rcc - don't know quite what to do with this. I think we can
623 * just ignore it. The only time that isn't the case is if we allow
624 * the client to somehow see that quotas have been turned off, in which
625 * case we can't allow that to get back until the quotaoff hits the disk.
626 * So how would that happen? Also, do we need different routines for
627 * quotaoff start and quotaoff end? I suspect the answer is yes but
628 * to be sure, I need to look at the recovery code and see how quota off
629 * recovery is handled (do we roll forward or back or do something else).
630 * If we roll forwards or backwards, then we need two separate routines,
631 * one that does nothing and one that stamps in the lsn that matters
632 * (truly makes the quotaoff irrevocable). If we do something else,
633 * then maybe we don't need two.
634 */
635/* ARGSUSED */
636STATIC void
637xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
638{
639 return;
640}
641
642/* ARGSUSED */
643STATIC void
644xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
645{
646 return;
647}
648
649struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
650 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
651 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
652 xfs_qm_qoff_logitem_format,
653 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
654 .iop_unpin = (void(*)(xfs_log_item_t* ,int))
655 xfs_qm_qoff_logitem_unpin,
656 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
657 xfs_qm_qoff_logitem_unpin_remove,
658 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
659 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
660 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
661 xfs_qm_qoffend_logitem_committed,
662 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
663 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
664 .iop_pushbuf = NULL,
665 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
666 xfs_qm_qoffend_logitem_committing
667};
668
669/*
670 * This is the ops vector shared by all quotaoff-start log items.
671 */
672struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
673 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
674 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
675 xfs_qm_qoff_logitem_format,
676 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
677 .iop_unpin = (void(*)(xfs_log_item_t*, int))
678 xfs_qm_qoff_logitem_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
680 xfs_qm_qoff_logitem_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
682 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
684 xfs_qm_qoff_logitem_committed,
685 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
686 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
687 .iop_pushbuf = NULL,
688 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
689 xfs_qm_qoff_logitem_committing
690};
691
692/*
693 * Allocate and initialize a quotaoff item of the correct quota type(s).
694 */
695xfs_qoff_logitem_t *
696xfs_qm_qoff_logitem_init(
697 struct xfs_mount *mp,
698 xfs_qoff_logitem_t *start,
699 uint flags)
700{
701 xfs_qoff_logitem_t *qf;
702
703 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
704
705 qf->qql_item.li_type = XFS_LI_QUOTAOFF;
706 if (start)
707 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
708 else
709 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
710 qf->qql_item.li_mountp = mp;
711 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
712 qf->qql_format.qf_flags = flags;
713 qf->qql_start_lip = start;
714 return (qf);
715}
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
new file mode 100644
index 000000000000..9c6500dabcaa
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DQUOT_ITEM_H__
33#define __XFS_DQUOT_ITEM_H__
34
35struct xfs_dquot;
36struct xfs_trans;
37struct xfs_mount;
38struct xfs_qoff_logitem;
39
40typedef struct xfs_dq_logitem {
41 xfs_log_item_t qli_item; /* common portion */
42 struct xfs_dquot *qli_dquot; /* dquot ptr */
43 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
44 unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
45#ifdef DEBUG
46 uint64_t qli_push_owner;
47#endif
48 xfs_dq_logformat_t qli_format; /* logged structure */
49} xfs_dq_logitem_t;
50
51typedef struct xfs_qoff_logitem {
52 xfs_log_item_t qql_item; /* common portion */
53 struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
54 xfs_qoff_logformat_t qql_format; /* logged structure */
55} xfs_qoff_logitem_t;
56
57
58extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *);
59extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
60 struct xfs_qoff_logitem *, uint);
61extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
62 struct xfs_qoff_logitem *, uint);
63extern void xfs_trans_log_quotaoff_item(struct xfs_trans *,
64 struct xfs_qoff_logitem *);
65
66#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
new file mode 100644
index 000000000000..89f2cd656ebf
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm.c
@@ -0,0 +1,2848 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_clnt.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_alloc.h"
44#include "xfs_dmapi.h"
45#include "xfs_quota.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_bit.h"
59#include "xfs_rtalloc.h"
60#include "xfs_error.h"
61#include "xfs_itable.h"
62#include "xfs_rw.h"
63#include "xfs_acl.h"
64#include "xfs_cap.h"
65#include "xfs_mac.h"
66#include "xfs_attr.h"
67#include "xfs_buf_item.h"
68#include "xfs_trans_space.h"
69#include "xfs_utils.h"
70
71#include "xfs_qm.h"
72
73/*
74 * The global quota manager. There is only one of these for the entire
75 * system, _not_ one per file system. XQM keeps track of the overall
76 * quota functionality, including maintaining the freelist and hash
77 * tables of dquots.
78 */
79mutex_t xfs_Gqm_lock;
80struct xfs_qm *xfs_Gqm;
81
82kmem_zone_t *qm_dqzone;
83kmem_zone_t *qm_dqtrxzone;
84kmem_shaker_t xfs_qm_shaker;
85
86STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
87STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
88
89STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
90STATIC int xfs_qm_shake(int, unsigned int);
91
92#ifdef DEBUG
93extern mutex_t qcheck_lock;
94#endif
95
96#ifdef QUOTADEBUG
97#define XQM_LIST_PRINT(l, NXT, title) \
98{ \
99 xfs_dquot_t *dqp; int i = 0; \
100 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
101 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
102 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \
103 "bcnt = %d, icnt = %d, refs = %d", \
104 ++i, (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT), \
105 DQFLAGTO_TYPESTR(dqp), \
106 (int) INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), \
107 (int) INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), \
108 (int) dqp->q_nrefs); } \
109}
110#else
111#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
112#endif
113
114/*
115 * Initialize the XQM structure.
116 * Note that there is not one quota manager per file system.
117 */
118STATIC struct xfs_qm *
119xfs_Gqm_init(void)
120{
121 xfs_qm_t *xqm;
122 int hsize, i;
123
124 xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
125 ASSERT(xqm);
126
127 /*
128 * Initialize the dquot hash tables.
129 */
130 hsize = (DQUOT_HASH_HEURISTIC < XFS_QM_NCSIZE_THRESHOLD) ?
131 XFS_QM_HASHSIZE_LOW : XFS_QM_HASHSIZE_HIGH;
132 xqm->qm_dqhashmask = hsize - 1;
133
134 xqm->qm_usr_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
135 sizeof(xfs_dqhash_t),
136 KM_SLEEP);
137 xqm->qm_grp_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
138 sizeof(xfs_dqhash_t),
139 KM_SLEEP);
140 ASSERT(xqm->qm_usr_dqhtable != NULL);
141 ASSERT(xqm->qm_grp_dqhtable != NULL);
142
143 for (i = 0; i < hsize; i++) {
144 xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
145 xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
146 }
147
148 /*
149 * Freelist of all dquots of all file systems
150 */
151 xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
152
153 /*
154 * dquot zone. we register our own low-memory callback.
155 */
156 if (!qm_dqzone) {
157 xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
158 "xfs_dquots");
159 qm_dqzone = xqm->qm_dqzone;
160 } else
161 xqm->qm_dqzone = qm_dqzone;
162
163 xfs_qm_shaker = kmem_shake_register(xfs_qm_shake);
164
165 /*
166 * The t_dqinfo portion of transactions.
167 */
168 if (!qm_dqtrxzone) {
169 xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
170 "xfs_dqtrx");
171 qm_dqtrxzone = xqm->qm_dqtrxzone;
172 } else
173 xqm->qm_dqtrxzone = qm_dqtrxzone;
174
175 atomic_set(&xqm->qm_totaldquots, 0);
176 xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
177 xqm->qm_nrefs = 0;
178#ifdef DEBUG
179 mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk");
180#endif
181 return xqm;
182}
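/*
 * A sketch of the power-of-two hash sizing above: with hsize a power of
 * two, the mask "hsize - 1" (qm_dqhashmask) turns the bucket lookup
 * into a single AND.  The size and hash function here are illustrative,
 * not the kernel's.
 */
#include <stdio.h>

#define HASHSIZE 64u                 /* must be a power of two */
#define HASHMASK (HASHSIZE - 1)      /* like qm_dqhashmask */

static unsigned int dq_hash(unsigned int id)
{
    return id & HASHMASK;            /* bucket index, no division */
}

int main(void)
{
    printf("id 1000 -> bucket %u\n", dq_hash(1000));
    printf("id 1064 -> bucket %u\n", dq_hash(1064)); /* same bucket: 40 */
    return 0;
}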
183
184/*
185 * Destroy the global quota manager when its reference count goes to zero.
186 */
187void
188xfs_qm_destroy(
189 struct xfs_qm *xqm)
190{
191 int hsize, i;
192
193 ASSERT(xqm != NULL);
194 ASSERT(xqm->qm_nrefs == 0);
195 kmem_shake_deregister(xfs_qm_shaker);
196 hsize = xqm->qm_dqhashmask + 1;
197 for (i = 0; i < hsize; i++) {
198 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
199 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
200 }
201 kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
202 kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
203 xqm->qm_usr_dqhtable = NULL;
204 xqm->qm_grp_dqhtable = NULL;
205 xqm->qm_dqhashmask = 0;
206 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
207#ifdef DEBUG
208 mutex_destroy(&qcheck_lock);
209#endif
210 kmem_free(xqm, sizeof(xfs_qm_t));
211}
212
213/*
214 * Called at mount time to let XQM know that another file system is
215 * starting quotas. This isn't crucial information as the individual mount
216 * structures are pretty independent, but it helps the XQM keep a
217 * global view of what's going on.
218 */
219/* ARGSUSED */
220STATIC int
221xfs_qm_hold_quotafs_ref(
222 struct xfs_mount *mp)
223{
224 /*
225 * Need to lock the xfs_Gqm structure for things like this. For example,
226 * the structure could disappear between the entry to this routine and
227 * a HOLD operation if not locked.
228 */
229 XFS_QM_LOCK(xfs_Gqm);
230
231 if (xfs_Gqm == NULL)
232 xfs_Gqm = xfs_Gqm_init();
233 /*
234 * We can keep a list of all filesystems with quotas mounted for
235 * debugging and statistical purposes, but ...
236 * Just take a reference and get out.
237 */
238 XFS_QM_HOLD(xfs_Gqm);
239 XFS_QM_UNLOCK(xfs_Gqm);
240
241 return 0;
242}
243
244
245/*
246 * Release the reference that a filesystem took at mount time,
247 * so that we know when we need to destroy the entire quota manager.
248 */
249/* ARGSUSED */
250STATIC void
251xfs_qm_rele_quotafs_ref(
252 struct xfs_mount *mp)
253{
254 xfs_dquot_t *dqp, *nextdqp;
255
256 ASSERT(xfs_Gqm);
257 ASSERT(xfs_Gqm->qm_nrefs > 0);
258
259 /*
260 * Go thru the freelist and destroy all inactive dquots.
261 */
262 xfs_qm_freelist_lock(xfs_Gqm);
263
264 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
265 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
266 xfs_dqlock(dqp);
267 nextdqp = dqp->dq_flnext;
268 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
269 ASSERT(dqp->q_mount == NULL);
270 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
271 ASSERT(dqp->HL_PREVP == NULL);
272 ASSERT(dqp->MPL_PREVP == NULL);
273 XQM_FREELIST_REMOVE(dqp);
274 xfs_dqunlock(dqp);
275 xfs_qm_dqdestroy(dqp);
276 } else {
277 xfs_dqunlock(dqp);
278 }
279 dqp = nextdqp;
280 }
281 xfs_qm_freelist_unlock(xfs_Gqm);
282
283 /*
284 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
285 * be restarted.
286 */
287 XFS_QM_LOCK(xfs_Gqm);
288 XFS_QM_RELE(xfs_Gqm);
289 if (xfs_Gqm->qm_nrefs == 0) {
290 xfs_qm_destroy(xfs_Gqm);
291 xfs_Gqm = NULL;
292 }
293 XFS_QM_UNLOCK(xfs_Gqm);
294}
295
296/*
297 * This is called at mount time from xfs_mountfs to initialize the quotainfo
298 * structure and start the global quota manager (xfs_Gqm) if that hasn't
299 * been done already.  Note that the superblock has not been read in yet.
300 */
301void
302xfs_qm_mount_quotainit(
303 xfs_mount_t *mp,
304 uint flags)
305{
306 /*
307 * User or group quotas have to be on.
308 */
309 ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA));
310
311 /*
312 * Initialize the flags in the mount structure. From this point
313 * onwards we look at m_qflags to figure out if quotas are on or off, etc.
314 * Note that we enforce nothing if accounting is off.
315 * ie. XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF.
316 * It isn't necessary to take the quotaoff lock to do this; this is
317 * called from mount.
318 */
319 if (flags & XFSMNT_UQUOTA) {
320 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
321 if (flags & XFSMNT_UQUOTAENF)
322 mp->m_qflags |= XFS_UQUOTA_ENFD;
323 }
324 if (flags & XFSMNT_GQUOTA) {
325 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
326 if (flags & XFSMNT_GQUOTAENF)
327 mp->m_qflags |= XFS_GQUOTA_ENFD;
328 }
329}
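/*
 * A sketch of the accounting/enforcement dependency described above:
 * the *QUOTAENF mount flag only has an effect when the matching *QUOTA
 * accounting flag is also set.  Bit values are invented; the shape
 * matches the user-quota branch of the code.
 */
#include <stdio.h>

#define MNT_UQUOTA     0x1u
#define MNT_UQUOTAENF  0x2u

#define Q_UACCT        0x1u
#define Q_UACTIVE      0x2u
#define Q_UENFD        0x4u

static unsigned int map_qflags(unsigned int mnt)
{
    unsigned int q = 0;

    if (mnt & MNT_UQUOTA) {
        q |= Q_UACCT | Q_UACTIVE;
        if (mnt & MNT_UQUOTAENF)
            q |= Q_UENFD;       /* enforce only if accounting is on */
    }
    return q;
}

int main(void)
{
    /* Enforcement without accounting is silently ignored: */
    printf("%#x\n", map_qflags(MNT_UQUOTAENF));              /* 0   */
    printf("%#x\n", map_qflags(MNT_UQUOTA | MNT_UQUOTAENF)); /* 0x7 */
    return 0;
}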
330
331/*
332 * Just destroy the quotainfo structure.
333 */
334void
335xfs_qm_unmount_quotadestroy(
336 xfs_mount_t *mp)
337{
338 if (mp->m_quotainfo)
339 xfs_qm_destroy_quotainfo(mp);
340}
341
342
343/*
344 * This is called from xfs_mountfs to start quotas and initialize all
345 * necessary data structures like quotainfo. This is also responsible for
346 * running a quotacheck as necessary. We are guaranteed that the superblock
347 * is consistently read in at this point.
348 */
349int
350xfs_qm_mount_quotas(
351 xfs_mount_t *mp,
352 int mfsi_flags)
353{
354 unsigned long s;
355 int error = 0;
356 uint sbf;
357
358 /*
359 * If a file system had quotas running earlier, but decided to
360 * mount without -o quota/uquota/gquota options, revoke the
361 * quotachecked license, and bail out.
362 */
363 if (! XFS_IS_QUOTA_ON(mp) &&
364 (mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) {
365 mp->m_qflags = 0;
366 goto write_changes;
367 }
368
369 /*
370 * Quotas are not supported on realtime volumes, so we disable
371 * them immediately.
372 */
373 if (mp->m_sb.sb_rextents) {
374 cmn_err(CE_NOTE,
375 "Cannot turn on quotas for realtime filesystem %s",
376 mp->m_fsname);
377 mp->m_qflags = 0;
378 goto write_changes;
379 }
380
381#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
382 cmn_err(CE_NOTE, "Attempting to turn on disk quotas.");
383#endif
384
385 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
386 /*
387 * Allocate the quotainfo structure inside the mount struct, and
388 * create quotainode(s), and change/rev superblock if necessary.
389 */
390 if ((error = xfs_qm_init_quotainfo(mp))) {
391 /*
392 * We must turn off quotas.
393 */
394 ASSERT(mp->m_quotainfo == NULL);
395 mp->m_qflags = 0;
396 goto write_changes;
397 }
398 /*
399 * If any of the quotas are not consistent, do a quotacheck.
400 */
401 if (XFS_QM_NEED_QUOTACHECK(mp) &&
402 !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
403#ifdef DEBUG
404 cmn_err(CE_NOTE, "Doing a quotacheck. Please wait.");
405#endif
406 if ((error = xfs_qm_quotacheck(mp))) {
407 /* Quotacheck has failed and quotas have
408 * been disabled.
409 */
410 return XFS_ERROR(error);
411 }
412#ifdef DEBUG
413 cmn_err(CE_NOTE, "Done quotacheck.");
414#endif
415 }
416 write_changes:
417 /*
418 * We actually don't have to acquire the SB_LOCK at all.
419 * This can only be called from mount, and that's single threaded. XXX
420 */
421 s = XFS_SB_LOCK(mp);
422 sbf = mp->m_sb.sb_qflags;
423 mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
424 XFS_SB_UNLOCK(mp, s);
425
426 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
427 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
428 /*
429 * We could only have been turning quotas off.
430 * We aren't in very good shape actually because
431 * the incore structures are convinced that quotas are
432 * off, but the on disk superblock doesn't know that !
433 */
434 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
435 xfs_fs_cmn_err(CE_ALERT, mp,
436 "XFS mount_quotas: Superblock update failed!");
437 }
438 }
439
440 if (error) {
441 xfs_fs_cmn_err(CE_WARN, mp,
442 "Failed to initialize disk quotas.");
443 }
444 return XFS_ERROR(error);
445}
446
447/*
448 * Called from the vfsops layer.
449 */
450int
451xfs_qm_unmount_quotas(
452 xfs_mount_t *mp)
453{
454 xfs_inode_t *uqp, *gqp;
455 int error = 0;
456
457 /*
458 * Release the dquots that root inode, et al might be holding,
459 * before we flush quotas and blow away the quotainfo structure.
460 */
461 ASSERT(mp->m_rootip);
462 xfs_qm_dqdetach(mp->m_rootip);
463 if (mp->m_rbmip)
464 xfs_qm_dqdetach(mp->m_rbmip);
465 if (mp->m_rsumip)
466 xfs_qm_dqdetach(mp->m_rsumip);
467
468 /*
469 * Flush out the quota inodes.
470 */
471 uqp = gqp = NULL;
472 if (mp->m_quotainfo) {
473 if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
474 xfs_ilock(uqp, XFS_ILOCK_EXCL);
475 xfs_iflock(uqp);
476 error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
477 xfs_iunlock(uqp, XFS_ILOCK_EXCL);
478 if (unlikely(error == EFSCORRUPTED)) {
479 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
480 XFS_ERRLEVEL_LOW, mp);
481 goto out;
482 }
483 }
484 if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
485 xfs_ilock(gqp, XFS_ILOCK_EXCL);
486 xfs_iflock(gqp);
487 error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
488 xfs_iunlock(gqp, XFS_ILOCK_EXCL);
489 if (unlikely(error == EFSCORRUPTED)) {
490 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
491 XFS_ERRLEVEL_LOW, mp);
492 goto out;
493 }
494 }
495 }
496 if (uqp) {
497 XFS_PURGE_INODE(uqp);
498 mp->m_quotainfo->qi_uquotaip = NULL;
499 }
500 if (gqp) {
501 XFS_PURGE_INODE(gqp);
502 mp->m_quotainfo->qi_gquotaip = NULL;
503 }
504out:
505 return XFS_ERROR(error);
506}
507
508/*
509 * Flush all dquots of the given file system to disk. The dquots are
510 * _not_ purged from memory here, just their data written to disk.
511 */
512int
513xfs_qm_dqflush_all(
514 xfs_mount_t *mp,
515 int flags)
516{
517 int recl;
518 xfs_dquot_t *dqp;
519 int niters;
520 int error;
521
522 if (mp->m_quotainfo == NULL)
523 return (0);
524 niters = 0;
525again:
526 xfs_qm_mplist_lock(mp);
527 FOREACH_DQUOT_IN_MP(dqp, mp) {
528 xfs_dqlock(dqp);
529 if (! XFS_DQ_IS_DIRTY(dqp)) {
530 xfs_dqunlock(dqp);
531 continue;
532 }
533 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
534 /* XXX a sentinel would be better */
535 recl = XFS_QI_MPLRECLAIMS(mp);
536 if (! xfs_qm_dqflock_nowait(dqp)) {
537 /*
538 * If we can't grab the flush lock then check
539 * to see if the dquot has been flushed delayed
540 * write. If so, grab its buffer and send it
541 * out immediately. We'll be able to acquire
542 * the flush lock when the I/O completes.
543 */
544 xfs_qm_dqflock_pushbuf_wait(dqp);
545 }
546 /*
547 * Let go of the mplist lock. We don't want to hold it
548 * across a disk write.
549 */
550 xfs_qm_mplist_unlock(mp);
551 error = xfs_qm_dqflush(dqp, flags);
552 xfs_dqunlock(dqp);
553 if (error)
554 return (error);
555
556 xfs_qm_mplist_lock(mp);
557 if (recl != XFS_QI_MPLRECLAIMS(mp)) {
558 xfs_qm_mplist_unlock(mp);
559 /* XXX restart limit */
560 goto again;
561 }
562 }
563
564 xfs_qm_mplist_unlock(mp);
565 /* return ! busy */
566 return (0);
567}
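/*
 * The list walk above uses a recurring idiom: remember a generation
 * counter (the reclaim count), drop the list lock around the slow work,
 * then relock and restart the walk if the counter moved.  A userspace
 * sketch with invented names; "reclaims" plays the role of
 * XFS_QI_MPLRECLAIMS.
 */
#include <pthread.h>

struct node { struct node *next; int dirty; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node    *list_head;
static int             reclaims;   /* bumped whenever the list shrinks */

static void slow_flush(struct node *n)
{
    n->dirty = 0;                  /* stand-in for a disk write */
}

static void flush_all(void)
{
again:
    pthread_mutex_lock(&list_lock);
    for (struct node *n = list_head; n != NULL; n = n->next) {
        int recl;

        if (!n->dirty)
            continue;
        recl = reclaims;                  /* remember the generation */
        pthread_mutex_unlock(&list_lock); /* never flush under the lock */
        slow_flush(n);
        pthread_mutex_lock(&list_lock);
        if (recl != reclaims) {           /* list changed under us: */
            pthread_mutex_unlock(&list_lock);
            goto again;                   /* our iterator may be stale */
        }
    }
    pthread_mutex_unlock(&list_lock);
}

int main(void)
{
    flush_all();    /* empty list: walks and returns */
    return 0;
}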
568/*
569 * Release the group dquot pointers the user dquots may be
570 * carrying around as a hint. mplist is locked on entry and exit.
571 */
572STATIC void
573xfs_qm_detach_gdquots(
574 xfs_mount_t *mp)
575{
576 xfs_dquot_t *dqp, *gdqp;
577 int nrecl;
578
579 again:
580 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
581 dqp = XFS_QI_MPLNEXT(mp);
582 while (dqp) {
583 xfs_dqlock(dqp);
584 if ((gdqp = dqp->q_gdquot)) {
585 xfs_dqlock(gdqp);
586 dqp->q_gdquot = NULL;
587 }
588 xfs_dqunlock(dqp);
589
590 if (gdqp) {
591 /*
592 * Can't hold the mplist lock across a dqput.
593 * XXX: must convert to marker-based iteration here.
594 */
595 nrecl = XFS_QI_MPLRECLAIMS(mp);
596 xfs_qm_mplist_unlock(mp);
597 xfs_qm_dqput(gdqp);
598
599 xfs_qm_mplist_lock(mp);
600 if (nrecl != XFS_QI_MPLRECLAIMS(mp))
601 goto again;
602 }
603 dqp = dqp->MPL_NEXT;
604 }
605}
606
607/*
608 * Go through all the incore dquots of this file system and take them
609 * off the mplist and hashlist, if the dquot type matches the dqtype
610 * parameter. This is used when turning off quota accounting for
611 * users and/or groups, as well as when the filesystem is unmounting.
612 */
613STATIC int
614xfs_qm_dqpurge_int(
615 xfs_mount_t *mp,
616 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/GQUOTA */
617{
618 xfs_dquot_t *dqp;
619 uint dqtype;
620 int nrecl;
621 xfs_dquot_t *nextdqp;
622 int nmisses;
623
624 if (mp->m_quotainfo == NULL)
625 return (0);
626
627 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
628 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
629
630 xfs_qm_mplist_lock(mp);
631
632 /*
633 * In the first pass through all incore dquots of this filesystem,
634 * we release the group dquot pointers the user dquots may be
635 * carrying around as a hint. We need to do this irrespective of
636 * what's being turned off.
637 */
638 xfs_qm_detach_gdquots(mp);
639
640 again:
641 nmisses = 0;
642 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
643 /*
644 * Try to get rid of all of the unwanted dquots. The idea is to
645 * get them off mplist and hashlist, but leave them on freelist.
646 */
647 dqp = XFS_QI_MPLNEXT(mp);
648 while (dqp) {
649 /*
650 * It's OK to look at the type without taking dqlock here.
651 * We're holding the mplist lock here, and that's needed for
652 * a dqreclaim.
653 */
654 if ((dqp->dq_flags & dqtype) == 0) {
655 dqp = dqp->MPL_NEXT;
656 continue;
657 }
658
659 if (! xfs_qm_dqhashlock_nowait(dqp)) {
660 nrecl = XFS_QI_MPLRECLAIMS(mp);
661 xfs_qm_mplist_unlock(mp);
662 XFS_DQ_HASH_LOCK(dqp->q_hash);
663 xfs_qm_mplist_lock(mp);
664
665 /*
666 * XXX: Theoretically, we can get into a very long
667 * ping-pong game here.
668 * No one can be adding dquots to the mplist at
669 * this point, but somebody might be taking things off.
670 */
671 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
672 XFS_DQ_HASH_UNLOCK(dqp->q_hash);
673 goto again;
674 }
675 }
676
677 /*
678 * Take the dquot off the mplist and hashlist. It may remain on
679 * freelist in INACTIVE state.
680 */
681 nextdqp = dqp->MPL_NEXT;
682 nmisses += xfs_qm_dqpurge(dqp, flags);
683 dqp = nextdqp;
684 }
685 xfs_qm_mplist_unlock(mp);
686 return nmisses;
687}
688
689int
690xfs_qm_dqpurge_all(
691 xfs_mount_t *mp,
692 uint flags)
693{
694 int ndquots;
695
696 /*
697 * Purge the dquot cache.
698 * None of the dquots should really be busy at this point.
699 */
700 if (mp->m_quotainfo) {
701 while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
702 delay(ndquots * 10);
703 }
704 }
705 return 0;
706}
707
708STATIC int
709xfs_qm_dqattach_one(
710 xfs_inode_t *ip,
711 xfs_dqid_t id,
712 uint type,
713 uint doalloc,
714 uint dolock,
715 xfs_dquot_t *udqhint, /* hint */
716 xfs_dquot_t **IO_idqpp)
717{
718 xfs_dquot_t *dqp;
719 int error;
720
721 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
722 error = 0;
723 /*
724 * See if we already have it in the inode itself. IO_idqpp is
725 * &i_udquot or &i_gdquot. This made the code look weird, but
726 * made the logic a lot simpler.
727 */
728 if ((dqp = *IO_idqpp)) {
729 if (dolock)
730 xfs_dqlock(dqp);
731 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
732 goto done;
733 }
734
735 /*
736 * udqhint is the i_udquot field in inode, and is non-NULL only
737 * when the type arg is XFS_DQ_GROUP. Its purpose is to save a
738 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
739 * the user dquot.
740 */
741 ASSERT(!udqhint || type == XFS_DQ_GROUP);
742 if (udqhint && !dolock)
743 xfs_dqlock(udqhint);
744
745 /*
746 * No need to take dqlock to look at the id.
747 * The ID can't change until it gets reclaimed, and it won't
748 * be reclaimed as long as we have a ref from inode and we hold
749 * the ilock.
750 */
751 if (udqhint &&
752 (dqp = udqhint->q_gdquot) &&
753 (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id)) {
754 ASSERT(XFS_DQ_IS_LOCKED(udqhint));
755 xfs_dqlock(dqp);
756 XFS_DQHOLD(dqp);
757 ASSERT(*IO_idqpp == NULL);
758 *IO_idqpp = dqp;
759 if (!dolock) {
760 xfs_dqunlock(dqp);
761 xfs_dqunlock(udqhint);
762 }
763 goto done;
764 }
765 /*
766 * We can't hold a dquot lock when we call the dqget code.
767 * We'll deadlock in no time, because of (not conforming to)
768 * lock ordering - the inodelock comes before any dquot lock,
769 * and we may drop and reacquire the ilock in xfs_qm_dqget().
770 */
771 if (udqhint)
772 xfs_dqunlock(udqhint);
773 /*
774 * Find the dquot from somewhere. This bumps the
775 * reference count of dquot and returns it locked.
776 * This can return ENOENT if the dquot didn't exist on
777 * disk and we didn't ask it to allocate;
778 * ESRCH if quotas got turned off suddenly.
779 */
780 if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
781 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
782 if (udqhint && dolock)
783 xfs_dqlock(udqhint);
784 goto done;
785 }
786
787 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
788 /*
789 * dqget may have dropped and re-acquired the ilock, but it guarantees
790 * that the dquot returned is the one that should go in the inode.
791 */
792 *IO_idqpp = dqp;
793 ASSERT(dqp);
794 ASSERT(XFS_DQ_IS_LOCKED(dqp));
795 if (! dolock) {
796 xfs_dqunlock(dqp);
797 goto done;
798 }
799 if (! udqhint)
800 goto done;
801
802 ASSERT(udqhint);
803 ASSERT(dolock);
804 ASSERT(XFS_DQ_IS_LOCKED(dqp));
805 if (! xfs_qm_dqlock_nowait(udqhint)) {
806 xfs_dqunlock(dqp);
807 xfs_dqlock(udqhint);
808 xfs_dqlock(dqp);
809 }
810 done:
811#ifdef QUOTADEBUG
812 if (udqhint) {
813 if (dolock)
814 ASSERT(XFS_DQ_IS_LOCKED(udqhint));
815 }
816 if (! error) {
817 if (dolock)
818 ASSERT(XFS_DQ_IS_LOCKED(dqp));
819 }
820#endif
821 return (error);
822}
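/*
 * The tail of xfs_qm_dqattach_one shows the standard trylock-or-reorder
 * idiom for taking a second lock that sorts *before* one already held:
 * try it without sleeping, and on failure drop everything and reacquire
 * in canonical order.  A userspace sketch; the ordering rule modelled
 * is "udq before gdq", and the names are illustrative.
 */
#include <pthread.h>

static void lock_pair_in_order(pthread_mutex_t *udq, pthread_mutex_t *gdq)
{
    /* Caller already holds gdq but still needs udq, which orders first. */
    if (pthread_mutex_trylock(udq) != 0) {
        pthread_mutex_unlock(gdq);   /* give up the later lock...    */
        pthread_mutex_lock(udq);     /* ...then take both in         */
        pthread_mutex_lock(gdq);     /* canonical order: no deadlock */
    }
}

int main(void)
{
    pthread_mutex_t udq = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t gdq = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&gdq);        /* arrive holding only gdq */
    lock_pair_in_order(&udq, &gdq);  /* leave holding both      */
    return 0;
}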
823
824
825/*
826 * Given a udquot and gdquot, attach a ptr to the group dquot in the
827 * udquot as a hint for future lookups. The idea sounds simple, but the
828 * execution isn't, because the udquot might have a group dquot attached
829 * already and getting rid of that gets us into lock ordering constraints.
830 * The process is complicated more by the fact that the dquots may or may not
831 * be locked on entry.
832 */
833STATIC void
834xfs_qm_dqattach_grouphint(
835 xfs_dquot_t *udq,
836 xfs_dquot_t *gdq,
837 uint locked)
838{
839 xfs_dquot_t *tmp;
840
841#ifdef QUOTADEBUG
842 if (locked) {
843 ASSERT(XFS_DQ_IS_LOCKED(udq));
844 ASSERT(XFS_DQ_IS_LOCKED(gdq));
845 }
846#endif
847 if (! locked)
848 xfs_dqlock(udq);
849
850 if ((tmp = udq->q_gdquot)) {
851 if (tmp == gdq) {
852 if (! locked)
853 xfs_dqunlock(udq);
854 return;
855 }
856
857 udq->q_gdquot = NULL;
858 /*
859 * We can't keep any dqlocks when calling dqrele,
860 * because the freelist lock comes before dqlocks.
861 */
862 xfs_dqunlock(udq);
863 if (locked)
864 xfs_dqunlock(gdq);
865 /*
866 * We took a hard reference once upon a time in dqget,
867 * so give it back when the udquot no longer points at it.
868 * dqput() does the unlocking of the dquot.
869 */
870 xfs_qm_dqrele(tmp);
871
872 xfs_dqlock(udq);
873 xfs_dqlock(gdq);
874
875 } else {
876 ASSERT(XFS_DQ_IS_LOCKED(udq));
877 if (! locked) {
878 xfs_dqlock(gdq);
879 }
880 }
881
882 ASSERT(XFS_DQ_IS_LOCKED(udq));
883 ASSERT(XFS_DQ_IS_LOCKED(gdq));
884 /*
885 * Somebody could have attached a gdquot here,
886 * when we dropped the uqlock. If so, just do nothing.
887 */
888 if (udq->q_gdquot == NULL) {
889 XFS_DQHOLD(gdq);
890 udq->q_gdquot = gdq;
891 }
892 if (! locked) {
893 xfs_dqunlock(gdq);
894 xfs_dqunlock(udq);
895 }
896}
897
898
899/*
900 * Given a locked inode, attach dquot(s) to it, taking UQUOTAON / GQUOTAON
901 * into account.
902 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
903 * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
904 * much made this code a complete mess, but it has been pretty useful.
905 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
906 * Inode may get unlocked and relocked in here, and the caller must deal with
907 * the consequences.
908 */
909int
910xfs_qm_dqattach(
911 xfs_inode_t *ip,
912 uint flags)
913{
914 xfs_mount_t *mp = ip->i_mount;
915 uint nquotas = 0;
916 int error = 0;
917
918 if ((! XFS_IS_QUOTA_ON(mp)) ||
919 (! XFS_NOT_DQATTACHED(mp, ip)) ||
920 (ip->i_ino == mp->m_sb.sb_uquotino) ||
921 (ip->i_ino == mp->m_sb.sb_gquotino))
922 return (0);
923
924 ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
925 XFS_ISLOCKED_INODE_EXCL(ip));
926
927 if (! (flags & XFS_QMOPT_ILOCKED))
928 xfs_ilock(ip, XFS_ILOCK_EXCL);
929
930 if (XFS_IS_UQUOTA_ON(mp)) {
931 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
932 flags & XFS_QMOPT_DQALLOC,
933 flags & XFS_QMOPT_DQLOCK,
934 NULL, &ip->i_udquot);
935 if (error)
936 goto done;
937 nquotas++;
938 }
939 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
940 if (XFS_IS_GQUOTA_ON(mp)) {
941 error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
942 flags & XFS_QMOPT_DQALLOC,
943 flags & XFS_QMOPT_DQLOCK,
944 ip->i_udquot, &ip->i_gdquot);
945 /*
946 * Don't worry about the udquot that we may have
947 * attached above. It'll get detached, if not already.
948 */
949 if (error)
950 goto done;
951 nquotas++;
952 }
953
954 /*
955 * Attach this group quota to the user quota as a hint.
956 * This WON'T, in general, result in thrashing.
957 */
958 if (nquotas == 2) {
959 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
960 ASSERT(ip->i_udquot);
961 ASSERT(ip->i_gdquot);
962
963 /*
964 * We may or may not have the i_udquot locked at this point,
965 * but this check is OK since we don't depend on the i_gdquot to
966 * be accurate 100% all the time. It is just a hint, and this
967 * will succeed in general.
968 */
969 if (ip->i_udquot->q_gdquot == ip->i_gdquot)
970 goto done;
971 /*
972 * Attach i_gdquot to the gdquot hint inside the i_udquot.
973 */
974 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
975 flags & XFS_QMOPT_DQLOCK);
976 }
977
978 done:
979
980#ifdef QUOTADEBUG
981 if (! error) {
982 if (ip->i_udquot) {
983 if (flags & XFS_QMOPT_DQLOCK)
984 ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
985 }
986 if (ip->i_gdquot) {
987 if (flags & XFS_QMOPT_DQLOCK)
988 ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
989 }
990 if (XFS_IS_UQUOTA_ON(mp))
991 ASSERT(ip->i_udquot);
992 if (XFS_IS_GQUOTA_ON(mp))
993 ASSERT(ip->i_gdquot);
994 }
995#endif
996
997 if (! (flags & XFS_QMOPT_ILOCKED))
998 xfs_iunlock(ip, XFS_ILOCK_EXCL);
999
1000#ifdef QUOTADEBUG
1001 else
1002 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
1003#endif
1004 return (error);
1005}
1006
1007/*
1008 * Release dquots (and their references) if any.
1009 * The inode should be locked EXCL except when this is called by
1010 * xfs_ireclaim.
1011 */
1012void
1013xfs_qm_dqdetach(
1014 xfs_inode_t *ip)
1015{
1016 if (!(ip->i_udquot || ip->i_gdquot))
1017 return;
1018
1019 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
1020 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
1021 if (ip->i_udquot)
1022 xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
1023 if (ip->i_udquot) {
1024 xfs_qm_dqrele(ip->i_udquot);
1025 ip->i_udquot = NULL;
1026 }
1027 if (ip->i_gdquot) {
1028 xfs_qm_dqrele(ip->i_gdquot);
1029 ip->i_gdquot = NULL;
1030 }
1031}
1032
1033/*
1034 * This is called by VFS_SYNC; the flags arg determines the caller,
1035 * and its motives, as done in xfs_sync.
1036 *
1037 * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
1038 * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
1039 * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
1040 */
1041
1042int
1043xfs_qm_sync(
1044 xfs_mount_t *mp,
1045 short flags)
1046{
1047 int recl, restarts;
1048 xfs_dquot_t *dqp;
1049 uint flush_flags;
1050 boolean_t nowait;
1051 int error;
1052
1053 restarts = 0;
1054 /*
1055 * We won't block unless we are asked to.
1056 */
1057 nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
1058
1059 again:
1060 xfs_qm_mplist_lock(mp);
1061 /*
1062 * dqpurge_all() also takes the mplist lock and iterates through all dquots
1063 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
1064 * when we have the mplist lock, we know that dquots will be consistent
1065 * as long as we have it locked.
1066 */
1067 if (! XFS_IS_QUOTA_ON(mp)) {
1068 xfs_qm_mplist_unlock(mp);
1069 return (0);
1070 }
1071 FOREACH_DQUOT_IN_MP(dqp, mp) {
1072 /*
1073 * If this is vfs_sync calling, then skip the dquots that
1074 * don't 'seem' to be dirty, i.e. don't acquire the dqlock.
1075 * This is very similar to what xfs_sync does with inodes.
1076 */
1077 if (flags & SYNC_BDFLUSH) {
1078 if (! XFS_DQ_IS_DIRTY(dqp))
1079 continue;
1080 }
1081
1082 if (nowait) {
1083 /*
1084 * Try to acquire the dquot lock. We are NOT out of
1085 * lock order, but we just don't want to wait for this
1086 * lock, unless somebody wanted us to.
1087 */
1088 if (! xfs_qm_dqlock_nowait(dqp))
1089 continue;
1090 } else {
1091 xfs_dqlock(dqp);
1092 }
1093
1094 /*
1095 * Now, find out for sure if this dquot is dirty or not.
1096 */
1097 if (! XFS_DQ_IS_DIRTY(dqp)) {
1098 xfs_dqunlock(dqp);
1099 continue;
1100 }
1101
1102 /* XXX a sentinel would be better */
1103 recl = XFS_QI_MPLRECLAIMS(mp);
1104 if (! xfs_qm_dqflock_nowait(dqp)) {
1105 if (nowait) {
1106 xfs_dqunlock(dqp);
1107 continue;
1108 }
1109 /*
1110 * If we can't grab the flush lock, and the caller
1111 * really wanted us to give this our best shot,
1112 * see if we can give a push to the buffer before we wait
1113 * on the flush lock. At this point, we know that
1114 * even though the dquot is being flushed,
1115 * it has (new) dirty data.
1116 */
1117 xfs_qm_dqflock_pushbuf_wait(dqp);
1118 }
1119 /*
1120 * Let go of the mplist lock. We don't want to hold it
1121 * across a disk write
1122 */
1123 flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
1124 xfs_qm_mplist_unlock(mp);
1125 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
1126 error = xfs_qm_dqflush(dqp, flush_flags);
1127 xfs_dqunlock(dqp);
1128 if (error && XFS_FORCED_SHUTDOWN(mp))
1129 return(0); /* Need to prevent umount failure */
1130 else if (error)
1131 return (error);
1132
1133 xfs_qm_mplist_lock(mp);
1134 if (recl != XFS_QI_MPLRECLAIMS(mp)) {
1135 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1136 break;
1137
1138 xfs_qm_mplist_unlock(mp);
1139 goto again;
1140 }
1141 }
1142
1143 xfs_qm_mplist_unlock(mp);
1144 return (0);
1145}
1146
1147
1148/*
1149 * This initializes all the quota information that's kept in the
1150 * mount structure.
1151 */
1152int
1153xfs_qm_init_quotainfo(
1154 xfs_mount_t *mp)
1155{
1156 xfs_quotainfo_t *qinf;
1157 int error;
1158 xfs_dquot_t *dqp;
1159
1160 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1161
1162 /*
1163 * Tell XQM that we exist as soon as possible.
1164 */
1165 if ((error = xfs_qm_hold_quotafs_ref(mp))) {
1166 return (error);
1167 }
1168
1169 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
1170
1171 /*
1172 * See if quotainodes are setup, and if not, allocate them,
1173 * and change the superblock accordingly.
1174 */
1175 if ((error = xfs_qm_init_quotainos(mp))) {
1176 kmem_free(qinf, sizeof(xfs_quotainfo_t));
1177 mp->m_quotainfo = NULL;
1178 return (error);
1179 }
1180
1181 spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin");
1182 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1183 qinf->qi_dqreclaims = 0;
1184
1185 /* mutex used to serialize quotaoffs */
1186 mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff");
1187
1188 /* Precalc some constants */
1189 qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1190 ASSERT(qinf->qi_dqchunklen);
1191 qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
1192 do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
1193
1194 mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
1195
1196 /*
1197 * We try to get the limits from the superuser's limits fields.
1198 * This is quite hacky, but it is standard quota practice.
1199 * We look at the USR dquot with id == 0 first, but if user quotas
1200 * are not enabled we go to the GRP dquot with id == 0.
1201 * We don't really care to keep separate default limits for user
1202 * and group quotas, at least not at this point.
1203 */
1204 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
1205 (XFS_IS_UQUOTA_RUNNING(mp)) ?
1206 XFS_DQ_USER : XFS_DQ_GROUP,
1207 XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
1208 &dqp);
1209 if (! error) {
1210 xfs_disk_dquot_t *ddqp = &dqp->q_core;
1211
1212 /*
1213 * The warnings and timers set the grace period given to
1214 * a user or group before he or she can no longer perform
1215 * any more writing. If it is zero, a default is used.
1216 */
1217 qinf->qi_btimelimit =
1218 INT_GET(ddqp->d_btimer, ARCH_CONVERT) ?
1219 INT_GET(ddqp->d_btimer, ARCH_CONVERT) :
1220 XFS_QM_BTIMELIMIT;
1221 qinf->qi_itimelimit =
1222 INT_GET(ddqp->d_itimer, ARCH_CONVERT) ?
1223 INT_GET(ddqp->d_itimer, ARCH_CONVERT) :
1224 XFS_QM_ITIMELIMIT;
1225 qinf->qi_rtbtimelimit =
1226 INT_GET(ddqp->d_rtbtimer, ARCH_CONVERT) ?
1227 INT_GET(ddqp->d_rtbtimer, ARCH_CONVERT) :
1228 XFS_QM_RTBTIMELIMIT;
1229 qinf->qi_bwarnlimit =
1230 INT_GET(ddqp->d_bwarns, ARCH_CONVERT) ?
1231 INT_GET(ddqp->d_bwarns, ARCH_CONVERT) :
1232 XFS_QM_BWARNLIMIT;
1233 qinf->qi_iwarnlimit =
1234 INT_GET(ddqp->d_iwarns, ARCH_CONVERT) ?
1235 INT_GET(ddqp->d_iwarns, ARCH_CONVERT) :
1236 XFS_QM_IWARNLIMIT;
1237 qinf->qi_bhardlimit =
1238 INT_GET(ddqp->d_blk_hardlimit, ARCH_CONVERT);
1239 qinf->qi_bsoftlimit =
1240 INT_GET(ddqp->d_blk_softlimit, ARCH_CONVERT);
1241 qinf->qi_ihardlimit =
1242 INT_GET(ddqp->d_ino_hardlimit, ARCH_CONVERT);
1243 qinf->qi_isoftlimit =
1244 INT_GET(ddqp->d_ino_softlimit, ARCH_CONVERT);
1245 qinf->qi_rtbhardlimit =
1246 INT_GET(ddqp->d_rtb_hardlimit, ARCH_CONVERT);
1247 qinf->qi_rtbsoftlimit =
1248 INT_GET(ddqp->d_rtb_softlimit, ARCH_CONVERT);
1249
1250 /*
1251 * We sent the XFS_QMOPT_DQSUSER flag to dqget because
1252 * we don't want this dquot cached. We haven't done a
1253 * quotacheck yet, and quotacheck doesn't like incore dquots.
1254 */
1255 xfs_qm_dqdestroy(dqp);
1256 } else {
1257 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
1258 qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
1259 qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
1260 qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
1261 qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
1262 }
1263
1264 return (0);
1265}
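/*
 * Every limit above is filled in with the same "use the id-0 dquot's
 * field if nonzero, else a compiled-in default" pattern.  A tiny helper
 * makes the shape explicit; the default value here is illustrative
 * rather than the kernel's constant.
 */
#include <stdio.h>
#include <stdint.h>

#define DEF_BTIMELIMIT (7u * 24 * 60 * 60)  /* e.g. one week, in seconds */

static uint32_t limit_or_default(uint32_t ondisk, uint32_t def)
{
    return ondisk ? ondisk : def;   /* zero on disk means "use default" */
}

int main(void)
{
    printf("%u\n", limit_or_default(0, DEF_BTIMELIMIT));    /* default  */
    printf("%u\n", limit_or_default(3600, DEF_BTIMELIMIT)); /* explicit */
    return 0;
}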
1266
1267
1268/*
1269 * Gets called when unmounting a filesystem or when all quotas get
1270 * turned off.
1271 * This purges the quota inodes, destroys locks and frees itself.
1272 */
1273void
1274xfs_qm_destroy_quotainfo(
1275 xfs_mount_t *mp)
1276{
1277 xfs_quotainfo_t *qi;
1278
1279 qi = mp->m_quotainfo;
1280 ASSERT(qi != NULL);
1281 ASSERT(xfs_Gqm != NULL);
1282
1283 /*
1284 * Release the reference that XQM kept, so that we know
1285 * when the XQM structure should be freed. We cannot assume
1286 * that xfs_Gqm is non-null after this point.
1287 */
1288 xfs_qm_rele_quotafs_ref(mp);
1289
1290 spinlock_destroy(&qi->qi_pinlock);
1291 xfs_qm_list_destroy(&qi->qi_dqlist);
1292
1293 if (qi->qi_uquotaip) {
1294 XFS_PURGE_INODE(qi->qi_uquotaip);
1295 qi->qi_uquotaip = NULL; /* paranoia */
1296 }
1297 if (qi->qi_gquotaip) {
1298 XFS_PURGE_INODE(qi->qi_gquotaip);
1299 qi->qi_gquotaip = NULL;
1300 }
1301 mutex_destroy(&qi->qi_quotaofflock);
1302 kmem_free(qi, sizeof(xfs_quotainfo_t));
1303 mp->m_quotainfo = NULL;
1304}
1305
1306
1307
1308/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
1309
1310/* ARGSUSED */
1311STATIC void
1312xfs_qm_list_init(
1313 xfs_dqlist_t *list,
1314 char *str,
1315 int n)
1316{
1317 mutex_init(&list->qh_lock, MUTEX_DEFAULT, str);
1318 list->qh_next = NULL;
1319 list->qh_version = 0;
1320 list->qh_nelems = 0;
1321}
1322
1323STATIC void
1324xfs_qm_list_destroy(
1325 xfs_dqlist_t *list)
1326{
1327 mutex_destroy(&(list->qh_lock));
1328}
1329
1330
1331/*
1332 * Stripped down version of dqattach. This doesn't attach, or even look at the
1333 * dquots attached to the inode. The rationale is that there won't be any
1334 * attached at the time this is called from quotacheck.
1335 */
1336STATIC int
1337xfs_qm_dqget_noattach(
1338 xfs_inode_t *ip,
1339 xfs_dquot_t **O_udqpp,
1340 xfs_dquot_t **O_gdqpp)
1341{
1342 int error;
1343 xfs_mount_t *mp;
1344 xfs_dquot_t *udqp, *gdqp;
1345
1346 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
1347 mp = ip->i_mount;
1348 udqp = NULL;
1349 gdqp = NULL;
1350
1351 if (XFS_IS_UQUOTA_ON(mp)) {
1352 ASSERT(ip->i_udquot == NULL);
1353 /*
1354 * We want the dquot allocated if it doesn't exist.
1355 */
1356 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1357 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1358 &udqp))) {
1359 /*
1360 * Shouldn't be able to turn off quotas here.
1361 */
1362 ASSERT(error != ESRCH);
1363 ASSERT(error != ENOENT);
1364 return (error);
1365 }
1366 ASSERT(udqp);
1367 }
1368
1369 if (XFS_IS_GQUOTA_ON(mp)) {
1370 ASSERT(ip->i_gdquot == NULL);
1371 if (udqp)
1372 xfs_dqunlock(udqp);
1373 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_gid, XFS_DQ_GROUP,
1374 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1375 &gdqp))) {
1376 if (udqp)
1377 xfs_qm_dqrele(udqp);
1378 ASSERT(error != ESRCH);
1379 ASSERT(error != ENOENT);
1380 return (error);
1381 }
1382 ASSERT(gdqp);
1383
1384 /* Reacquire the locks in the right order */
1385 if (udqp) {
1386 if (! xfs_qm_dqlock_nowait(udqp)) {
1387 xfs_dqunlock(gdqp);
1388 xfs_dqlock(udqp);
1389 xfs_dqlock(gdqp);
1390 }
1391 }
1392 }
1393
1394 *O_udqpp = udqp;
1395 *O_gdqpp = gdqp;
1396
1397#ifdef QUOTADEBUG
1398 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1399 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1400#endif
1401 return (0);
1402}
1403
1404/*
1405 * Create an inode and return with a reference already taken, but unlocked.
1406 * This is how we create quota inodes.
1407 */
1408STATIC int
1409xfs_qm_qino_alloc(
1410 xfs_mount_t *mp,
1411 xfs_inode_t **ip,
1412 __int64_t sbfields,
1413 uint flags)
1414{
1415 xfs_trans_t *tp;
1416 int error;
1417 unsigned long s;
1418 cred_t zerocr;
1419 int committed;
1420
1421 tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE);
1422 if ((error = xfs_trans_reserve(tp,
1423 XFS_QM_QINOCREATE_SPACE_RES(mp),
1424 XFS_CREATE_LOG_RES(mp), 0,
1425 XFS_TRANS_PERM_LOG_RES,
1426 XFS_CREATE_LOG_COUNT))) {
1427 xfs_trans_cancel(tp, 0);
1428 return (error);
1429 }
1430 memset(&zerocr, 0, sizeof(zerocr));
1431
1432 if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, S_IFREG, 1, 0,
1433 &zerocr, 0, 1, ip, &committed))) {
1434 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1435 XFS_TRANS_ABORT);
1436 return (error);
1437 }
1438
1439 /*
1440 * Keep an extra reference to this quota inode. This inode is
1441 * locked exclusively and joined to the transaction already.
1442 */
1443 ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip));
1444 VN_HOLD(XFS_ITOV((*ip)));
1445
1446 /*
1447 * Make the changes in the superblock, and log those too.
1448 * sbfields arg may contain fields other than *QUOTINO;
1449 * VERSIONNUM for example.
1450 */
1451 s = XFS_SB_LOCK(mp);
1452 if (flags & XFS_QMOPT_SBVERSION) {
1453#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1454 unsigned oldv = mp->m_sb.sb_versionnum;
1455#endif
1456 ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
1457 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1458 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
1459 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1460 XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
1461
1462 XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
1463 mp->m_sb.sb_uquotino = NULLFSINO;
1464 mp->m_sb.sb_gquotino = NULLFSINO;
1465
1466 /* qflags will get updated _after_ quotacheck */
1467 mp->m_sb.sb_qflags = 0;
1468#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1469 cmn_err(CE_NOTE,
1470 "Old superblock version %x, converting to %x.",
1471 oldv, mp->m_sb.sb_versionnum);
1472#endif
1473 }
1474 if (flags & XFS_QMOPT_UQUOTA)
1475 mp->m_sb.sb_uquotino = (*ip)->i_ino;
1476 else
1477 mp->m_sb.sb_gquotino = (*ip)->i_ino;
1478 XFS_SB_UNLOCK(mp, s);
1479 xfs_mod_sb(tp, sbfields);
1480
1481 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
1482 NULL))) {
1483 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
1484 return (error);
1485 }
1486 return (0);
1487}
1488
1489
1490STATIC int
1491xfs_qm_reset_dqcounts(
1492 xfs_mount_t *mp,
1493 xfs_buf_t *bp,
1494 xfs_dqid_t id,
1495 uint type)
1496{
1497 xfs_disk_dquot_t *ddq;
1498 int j;
1499
1500 xfs_buftrace("RESET DQUOTS", bp);
1501 /*
1502 * Reset all counters and timers. They'll be
1503 * started afresh by xfs_qm_quotacheck.
1504 */
1505#ifdef DEBUG
1506 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1507 do_div(j, sizeof(xfs_dqblk_t));
1508 ASSERT(XFS_QM_DQPERBLK(mp) == j);
1509#endif
1510 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1511 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
1512 /*
1513 * Do a sanity check, and if needed, repair the dqblk. Don't
1514 * output any warnings because it's perfectly possible to
1515 * find uninitialized dquot blks. See comment in xfs_qm_dqcheck.
1516 */
1517 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1518 "xfs_quotacheck");
1519 INT_SET(ddq->d_bcount, ARCH_CONVERT, 0ULL);
1520 INT_SET(ddq->d_icount, ARCH_CONVERT, 0ULL);
1521 INT_SET(ddq->d_rtbcount, ARCH_CONVERT, 0ULL);
1522 INT_SET(ddq->d_btimer, ARCH_CONVERT, (time_t)0);
1523 INT_SET(ddq->d_itimer, ARCH_CONVERT, (time_t)0);
1524 INT_SET(ddq->d_bwarns, ARCH_CONVERT, 0UL);
1525 INT_SET(ddq->d_iwarns, ARCH_CONVERT, 0UL);
1526 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
1527 }
1528
1529 return (0);
1530}
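
/*
 * Illustrative arithmetic for the DEBUG check above, using the figures
 * from the cluster-size comment in xfs_qm.h (136-byte on-disk dquot
 * records, 136 * 30 = 4080): with 4096-byte filesystem blocks,
 * XFS_QM_DQPERBLK(mp) works out to 4096 / 136 = 30 dquots per block,
 * with 16 bytes of slack. A standalone sketch, not kernel code:
 */
static int dquots_per_block(int blocksize, int dqrec_size)
{
	return blocksize / dqrec_size;	/* integer division; remainder is slack */
}

/* e.g. dquots_per_block(4096, 136) == 30 */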
1531
1532STATIC int
1533xfs_qm_dqiter_bufs(
1534 xfs_mount_t *mp,
1535 xfs_dqid_t firstid,
1536 xfs_fsblock_t bno,
1537 xfs_filblks_t blkcnt,
1538 uint flags)
1539{
1540 xfs_buf_t *bp;
1541 int error;
1542 int notcommitted;
1543 int incr;
1544
1545 ASSERT(blkcnt > 0);
1546 notcommitted = 0;
1547 incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
1548 XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
1549 error = 0;
1550
1551 /*
1552 * Blkcnt arg can be a very big number, and might even be
1553 * larger than the log itself. So, we have to break it up into
1554 * manageable-sized transactions.
1555 * Note that we don't start a permanent transaction here; we might
1556 * not be able to get a log reservation for the whole thing up front,
1557 * and we don't really care to either, because we just discard
1558 * everything if we were to crash in the middle of this loop.
1559 */
1560 while (blkcnt--) {
1561 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1562 XFS_FSB_TO_DADDR(mp, bno),
1563 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
1564 if (error)
1565 break;
1566
1567 (void) xfs_qm_reset_dqcounts(mp, bp, firstid,
1568 flags & XFS_QMOPT_UQUOTA ?
1569 XFS_DQ_USER : XFS_DQ_GROUP);
1570 xfs_bdwrite(mp, bp);
1571 /*
1572	 * Go to the next block.
1573 */
1574 bno++;
1575 firstid += XFS_QM_DQPERBLK(mp);
1576 }
1577 return (error);
1578}
1579
1580/*
1581 * Iterate over all allocated USR/GRP dquots in the system, calling a
1582 * caller supplied function for every chunk of dquots that we find.
1583 */
1584STATIC int
1585xfs_qm_dqiterate(
1586 xfs_mount_t *mp,
1587 xfs_inode_t *qip,
1588 uint flags)
1589{
1590 xfs_bmbt_irec_t *map;
1591 int i, nmaps; /* number of map entries */
1592 int error; /* return value */
1593 xfs_fileoff_t lblkno;
1594 xfs_filblks_t maxlblkcnt;
1595 xfs_dqid_t firstid;
1596 xfs_fsblock_t rablkno;
1597 xfs_filblks_t rablkcnt;
1598
1599 error = 0;
1600 /*
1601 * This looks racy, but we can't keep an inode lock across a
1602 * trans_reserve. But, this gets called during quotacheck, and that
1603 * happens only at mount time, which is single-threaded.
1604 */
1605 if (qip->i_d.di_nblocks == 0)
1606 return (0);
1607
1608 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
1609
1610 lblkno = 0;
1611 maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1612 do {
1613 nmaps = XFS_DQITER_MAP_SIZE;
1614 /*
1615 * We aren't changing the inode itself. Just changing
1616 * some of its data. No new blocks are added here, and
1617 * the inode is never added to the transaction.
1618 */
1619 xfs_ilock(qip, XFS_ILOCK_SHARED);
1620 error = xfs_bmapi(NULL, qip, lblkno,
1621 maxlblkcnt - lblkno,
1622 XFS_BMAPI_METADATA,
1623 NULL,
1624 0, map, &nmaps, NULL);
1625 xfs_iunlock(qip, XFS_ILOCK_SHARED);
1626 if (error)
1627 break;
1628
1629 ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
1630 for (i = 0; i < nmaps; i++) {
1631 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
1632 ASSERT(map[i].br_blockcount);
1633
1634
1635 lblkno += map[i].br_blockcount;
1636
1637 if (map[i].br_startblock == HOLESTARTBLOCK)
1638 continue;
1639
1640 firstid = (xfs_dqid_t) map[i].br_startoff *
1641 XFS_QM_DQPERBLK(mp);
1642 /*
1643 * Do a read-ahead on the next extent.
1644 */
1645 if ((i+1 < nmaps) &&
1646 (map[i+1].br_startblock != HOLESTARTBLOCK)) {
1647 rablkcnt = map[i+1].br_blockcount;
1648 rablkno = map[i+1].br_startblock;
1649 while (rablkcnt--) {
1650 xfs_baread(mp->m_ddev_targp,
1651 XFS_FSB_TO_DADDR(mp, rablkno),
1652 (int)XFS_QI_DQCHUNKLEN(mp));
1653 rablkno++;
1654 }
1655 }
1656 /*
1657 * Iterate thru all the blks in the extent and
1658 * reset the counters of all the dquots inside them.
1659 */
1660 if ((error = xfs_qm_dqiter_bufs(mp,
1661 firstid,
1662 map[i].br_startblock,
1663 map[i].br_blockcount,
1664 flags))) {
1665 break;
1666 }
1667 }
1668
1669 if (error)
1670 break;
1671 } while (nmaps > 0);
1672
1673 kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
1674
1675 return (error);
1676}
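
/*
 * Sketch of the id/offset mapping used above (userspace, hypothetical
 * names): dquots are stored densely in the quota inode, DQPERBLK per
 * filesystem block, so the first id in file-offset block `startoff` is
 * simply startoff * DQPERBLK.
 */
typedef unsigned int sketch_dqid_t;

static sketch_dqid_t first_id_in_block(unsigned long long startoff,
				       unsigned int dqperblk)
{
	/* e.g. with 30 dquots per block, block 2 starts at id 60 */
	return (sketch_dqid_t)(startoff * dqperblk);
}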
1677
1678/*
1679 * Called by dqusage_adjust in doing a quotacheck.
1680 * Given the inode, and a dquot (either USR or GRP, doesn't matter),
1681 * this updates the dquot's incore copy as well as the buffer copy. This is
1682 * so that once the quotacheck is done, we can just log all the buffers,
1683 * as opposed to logging numerous updates to individual dquots.
1684 */
1685STATIC void
1686xfs_qm_quotacheck_dqadjust(
1687 xfs_dquot_t *dqp,
1688 xfs_qcnt_t nblks,
1689 xfs_qcnt_t rtblks)
1690{
1691 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1692 xfs_dqtrace_entry(dqp, "QCHECK DQADJUST");
1693 /*
1694 * Adjust the inode count and the block count to reflect this inode's
1695 * resource usage.
1696 */
1697 INT_MOD(dqp->q_core.d_icount, ARCH_CONVERT, +1);
1698 dqp->q_res_icount++;
1699 if (nblks) {
1700 INT_MOD(dqp->q_core.d_bcount, ARCH_CONVERT, nblks);
1701 dqp->q_res_bcount += nblks;
1702 }
1703 if (rtblks) {
1704 INT_MOD(dqp->q_core.d_rtbcount, ARCH_CONVERT, rtblks);
1705 dqp->q_res_rtbcount += rtblks;
1706 }
1707
1708 /*
1709 * Set default limits, adjust timers (since we changed usages)
1710 */
1711 if (! XFS_IS_SUSER_DQUOT(dqp)) {
1712 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1713 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1714 }
1715
1716 dqp->dq_flags |= XFS_DQ_DIRTY;
1717}
1718
1719STATIC int
1720xfs_qm_get_rtblks(
1721 xfs_inode_t *ip,
1722 xfs_qcnt_t *O_rtblks)
1723{
1724 xfs_filblks_t rtblks; /* total rt blks */
1725 xfs_ifork_t *ifp; /* inode fork pointer */
1726 xfs_extnum_t nextents; /* number of extent entries */
1727 xfs_bmbt_rec_t *base; /* base of extent array */
1728 xfs_bmbt_rec_t *ep; /* pointer to an extent entry */
1729 int error;
1730
1731 ASSERT(XFS_IS_REALTIME_INODE(ip));
1732 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1733 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
1734 if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
1735 return (error);
1736 }
1737 rtblks = 0;
1738 nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
1739 base = &ifp->if_u1.if_extents[0];
1740 for (ep = base; ep < &base[nextents]; ep++)
1741 rtblks += xfs_bmbt_get_blockcount(ep);
1742 *O_rtblks = (xfs_qcnt_t)rtblks;
1743 return (0);
1744}
1745
1746/*
1747 * Callback routine supplied to bulkstat(). Given an inumber, find its
1748 * dquots and update them to account for resources taken by that inode.
1749 */
1750/* ARGSUSED */
1751STATIC int
1752xfs_qm_dqusage_adjust(
1753 xfs_mount_t *mp, /* mount point for filesystem */
1754 xfs_ino_t ino, /* inode number to get data for */
1755 void __user *buffer, /* not used */
1756 int ubsize, /* not used */
1757 void *private_data, /* not used */
1758 xfs_daddr_t bno, /* starting block of inode cluster */
1759 int *ubused, /* not used */
1760 void *dip, /* on-disk inode pointer (not used) */
1761 int *res) /* result code value */
1762{
1763 xfs_inode_t *ip;
1764 xfs_dquot_t *udqp, *gdqp;
1765 xfs_qcnt_t nblks, rtblks;
1766 int error;
1767
1768 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1769
1770 /*
1771 * rootino must have its resources accounted for, not so with the quota
1772 * inodes.
1773 */
1774 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1775 *res = BULKSTAT_RV_NOTHING;
1776 return XFS_ERROR(EINVAL);
1777 }
1778
1779 /*
1780 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
1781 * interface expects the inode to be exclusively locked because that's
1782 * the case in all other instances. It's OK that we do this because
1783 * quotacheck is done only at mount time.
1784 */
1785 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) {
1786 *res = BULKSTAT_RV_NOTHING;
1787 return (error);
1788 }
1789
1790 if (ip->i_d.di_mode == 0) {
1791 xfs_iput_new(ip, XFS_ILOCK_EXCL);
1792 *res = BULKSTAT_RV_NOTHING;
1793 return XFS_ERROR(ENOENT);
1794 }
1795
1796 /*
1797 * Obtain the locked dquots. In case of an error (eg. allocation
1798 * fails for ENOSPC), we return the negative of the error number
1799 * to bulkstat, so that it can get propagated to quotacheck(),
1800 * which then disables quotas for the file system.
1801 */
1802 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1803 xfs_iput(ip, XFS_ILOCK_EXCL);
1804 *res = BULKSTAT_RV_GIVEUP;
1805 return (error);
1806 }
1807
1808 rtblks = 0;
1809 if (! XFS_IS_REALTIME_INODE(ip)) {
1810 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1811 } else {
1812 /*
1813 * Walk thru the extent list and count the realtime blocks.
1814 */
1815 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
1816 xfs_iput(ip, XFS_ILOCK_EXCL);
1817 if (udqp)
1818 xfs_qm_dqput(udqp);
1819 if (gdqp)
1820 xfs_qm_dqput(gdqp);
1821 *res = BULKSTAT_RV_GIVEUP;
1822 return (error);
1823 }
1824 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1825 }
1826 ASSERT(ip->i_delayed_blks == 0);
1827
1828 /*
1829 * We can't release the inode while holding its dquot locks.
1830 * The inode can go inactive and might try to acquire the dquot locks.
1831 * So, just unlock here and do a vn_rele at the end.
1832 */
1833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1834
1835 /*
1836 * Add the (disk blocks and inode) resources occupied by this
1837 * inode to its dquots. We do this adjustment in the incore dquot,
1838 * and also copy the changes to its buffer.
1839 * We don't care about putting these changes in a transaction
1840 * envelope because if we crash in the middle of a 'quotacheck'
1841 * we have to start from the beginning anyway.
1842 * Once we're done, we'll log all the dquot bufs.
1843 *
1844 * The *QUOTA_ON checks below may look pretty racy, but quotachecks
1845 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1846 */
1847 if (XFS_IS_UQUOTA_ON(mp)) {
1848 ASSERT(udqp);
1849 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
1850 xfs_qm_dqput(udqp);
1851 }
1852 if (XFS_IS_GQUOTA_ON(mp)) {
1853 ASSERT(gdqp);
1854 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
1855 xfs_qm_dqput(gdqp);
1856 }
1857 /*
1858 * Now release the inode. This will send it to 'inactive', and
1859 * possibly even free blocks.
1860 */
1861 VN_RELE(XFS_ITOV(ip));
1862
1863 /*
1864	 * Go to the next inode.
1865 */
1866 *res = BULKSTAT_RV_DIDONE;
1867 return (0);
1868}
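
/*
 * The bulkstat callback contract above, reduced to a standalone sketch
 * (userspace, hypothetical names): the walker hands every inode number
 * to the callback, which reports through *res whether it accounted the
 * inode, skipped it, or wants the whole walk aborted.
 */
enum walk_rv { RV_DIDONE, RV_NOTHING, RV_GIVEUP };

typedef int (*walk_cb_t)(unsigned long long ino, int *res);

static int walk_all_inodes(unsigned long long ninodes, walk_cb_t cb)
{
	unsigned long long ino;
	int res, error;

	for (ino = 0; ino < ninodes; ino++) {
		error = cb(ino, &res);	/* callback always sets *res */
		if (res == RV_GIVEUP)
			return error;	/* abort the walk, propagate the error */
		/* RV_DIDONE and RV_NOTHING both just continue */
	}
	return 0;
}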
1869
1870/*
1871 * Walk thru all the filesystem inodes and construct a consistent view
1872 * of the disk quota world. If the quotacheck fails, disable quotas.
1873 */
1874int
1875xfs_qm_quotacheck(
1876 xfs_mount_t *mp)
1877{
1878 int done, count, error;
1879 xfs_ino_t lastino;
1880 size_t structsz;
1881 xfs_inode_t *uip, *gip;
1882 uint flags;
1883
1884 count = INT_MAX;
1885 structsz = 1;
1886 lastino = 0;
1887 flags = 0;
1888
1889 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
1890 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1891
1892 /*
1893 * There should be no cached dquots. The (simplistic) quotacheck
1894 * algorithm doesn't like that.
1895 */
1896 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
1897
1898 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1899
1900 /*
1901 * First we go thru all the dquots on disk, USR and GRP, and reset
1902 * their counters to zero. We need a clean slate.
1903 * We don't log our changes till later.
1904 */
1905 if ((uip = XFS_QI_UQIP(mp))) {
1906 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
1907 goto error_return;
1908 flags |= XFS_UQUOTA_CHKD;
1909 }
1910
1911 if ((gip = XFS_QI_GQIP(mp))) {
1912 if ((error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA)))
1913 goto error_return;
1914 flags |= XFS_GQUOTA_CHKD;
1915 }
1916
1917 do {
1918 /*
1919 * Iterate thru all the inodes in the file system,
1920 * adjusting the corresponding dquot counters in core.
1921 */
1922 if ((error = xfs_bulkstat(mp, &lastino, &count,
1923 xfs_qm_dqusage_adjust, NULL,
1924 structsz, NULL,
1925 BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
1926 &done)))
1927 break;
1928
1929 } while (! done);
1930
1931 /*
1932 * We can get this error if we couldn't do a dquot allocation inside
1933 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
1934 * dirty dquots that might be cached, we just want to get rid of them
1935 * and turn quotaoff. The dquots won't be attached to any of the inodes
1936 * at this point (because we intentionally didn't in dqget_noattach).
1937 */
1938 if (error) {
1939 xfs_qm_dqpurge_all(mp,
1940 XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA|
1941 XFS_QMOPT_QUOTAOFF);
1942 goto error_return;
1943 }
1944 /*
1945 * We've made all the changes that we need to make incore.
1946 * Now flush them down to the disk buffers.
1947 */
1948 xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
1949
1950 /*
1951 * We didn't log anything, because if we crash, we'll have to
1952 * start the quotacheck from scratch anyway. However, we must make
1953 * sure that our dquot changes are secure before we put the
1954 * quotacheck'd stamp on the superblock. So, here we do a synchronous
1955 * flush.
1956 */
1957 XFS_bflush(mp->m_ddev_targp);
1958
1959 /*
1960 * If one type of quotas is off, then it will lose its
1961 * quotachecked status, since we won't be doing accounting for
1962 * that type anymore.
1963 */
1964 mp->m_qflags &= ~(XFS_GQUOTA_CHKD | XFS_UQUOTA_CHKD);
1965 mp->m_qflags |= flags;
1966
1967 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
1968
1969 error_return:
1970 if (error) {
1971 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): "
1972 "Disabling quotas.",
1973 mp->m_fsname, error);
1974 /*
1975 * We must turn off quotas.
1976 */
1977 ASSERT(mp->m_quotainfo != NULL);
1978 ASSERT(xfs_Gqm != NULL);
1979 xfs_qm_destroy_quotainfo(mp);
1980 xfs_mount_reset_sbqflags(mp);
1981 } else {
1982 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
1983 }
1984 return (error);
1985}
1986
1987/*
1988 * This is called after the superblock has been read in and we're ready to
1989 * iget the quota inodes.
1990 */
1991STATIC int
1992xfs_qm_init_quotainos(
1993 xfs_mount_t *mp)
1994{
1995 xfs_inode_t *uip, *gip;
1996 int error;
1997 __int64_t sbflags;
1998 uint flags;
1999
2000 ASSERT(mp->m_quotainfo);
2001 uip = gip = NULL;
2002 sbflags = 0;
2003 flags = 0;
2004
2005 /*
2006 * Get the uquota and gquota inodes
2007 */
2008 if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
2009 if (XFS_IS_UQUOTA_ON(mp) &&
2010 mp->m_sb.sb_uquotino != NULLFSINO) {
2011 ASSERT(mp->m_sb.sb_uquotino > 0);
2012 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
2013 0, 0, &uip, 0)))
2014 return XFS_ERROR(error);
2015 }
2016 if (XFS_IS_GQUOTA_ON(mp) &&
2017 mp->m_sb.sb_gquotino != NULLFSINO) {
2018 ASSERT(mp->m_sb.sb_gquotino > 0);
2019 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
2020 0, 0, &gip, 0))) {
2021 if (uip)
2022 VN_RELE(XFS_ITOV(uip));
2023 return XFS_ERROR(error);
2024 }
2025 }
2026 } else {
2027 flags |= XFS_QMOPT_SBVERSION;
2028 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
2029 XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
2030 }
2031
2032 /*
2033 * Create the two inodes, if they don't exist already. The changes
2034 * made above will get added to a transaction and logged in one of
2035 * the qino_alloc calls below. If the device is readonly,
2036 * temporarily switch to read-write to do this.
2037 */
2038 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
2039 if ((error = xfs_qm_qino_alloc(mp, &uip,
2040 sbflags | XFS_SB_UQUOTINO,
2041 flags | XFS_QMOPT_UQUOTA)))
2042 return XFS_ERROR(error);
2043
2044 flags &= ~XFS_QMOPT_SBVERSION;
2045 }
2046 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
2047 if ((error = xfs_qm_qino_alloc(mp, &gip,
2048 sbflags | XFS_SB_GQUOTINO,
2049 flags | XFS_QMOPT_GQUOTA))) {
2050 if (uip)
2051 VN_RELE(XFS_ITOV(uip));
2052
2053 return XFS_ERROR(error);
2054 }
2055 }
2056
2057 XFS_QI_UQIP(mp) = uip;
2058 XFS_QI_GQIP(mp) = gip;
2059
2060 return (0);
2061}
2062
2063
2064/*
2065 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2066 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2067 * favor the lookup function ...
2068 * XXXsup merge this with qm_reclaim_one().
2069 */
2070STATIC int
2071xfs_qm_shake_freelist(
2072 int howmany)
2073{
2074 int nreclaimed;
2075 xfs_dqhash_t *hash;
2076 xfs_dquot_t *dqp, *nextdqp;
2077 int restarts;
2078 int nflushes;
2079
2080 if (howmany <= 0)
2081 return (0);
2082
2083 nreclaimed = 0;
2084 restarts = 0;
2085 nflushes = 0;
2086
2087#ifdef QUOTADEBUG
2088 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
2089#endif
2090 /* lock order is : hashchainlock, freelistlock, mplistlock */
2091 tryagain:
2092 xfs_qm_freelist_lock(xfs_Gqm);
2093
2094 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
2095 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
2096 nreclaimed < howmany); ) {
2097 xfs_dqlock(dqp);
2098
2099 /*
2100 * We are racing with dqlookup here. Naturally we don't
2101 * want to reclaim a dquot that lookup wants.
2102 */
2103 if (dqp->dq_flags & XFS_DQ_WANT) {
2104 xfs_dqunlock(dqp);
2105 xfs_qm_freelist_unlock(xfs_Gqm);
2106 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2107 return (nreclaimed);
2108 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2109 goto tryagain;
2110 }
2111
2112 /*
2113 * If the dquot is inactive, we are assured that it is
2114 * not on the mplist or the hashlist, and that makes our
2115 * life easier.
2116 */
2117 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2118 ASSERT(dqp->q_mount == NULL);
2119 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2120 ASSERT(dqp->HL_PREVP == NULL);
2121 ASSERT(dqp->MPL_PREVP == NULL);
2122 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2123 nextdqp = dqp->dq_flnext;
2124 goto off_freelist;
2125 }
2126
2127 ASSERT(dqp->MPL_PREVP);
2128 /*
2129 * Try to grab the flush lock. If this dquot is in the process of
2130 * getting flushed to disk, we don't want to reclaim it.
2131 */
2132 if (! xfs_qm_dqflock_nowait(dqp)) {
2133 xfs_dqunlock(dqp);
2134 dqp = dqp->dq_flnext;
2135 continue;
2136 }
2137
2138 /*
2139 * We have the flush lock so we know that this is not in the
2140 * process of being flushed. So, if this is dirty, flush it
2141 * DELWRI so that we don't get a freelist infested with
2142 * dirty dquots.
2143 */
2144 if (XFS_DQ_IS_DIRTY(dqp)) {
2145 xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
2146 /*
2147 * We flush it delayed write, so don't bother
2148 * releasing the mplock.
2149 */
2150 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
2151 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2152 dqp = dqp->dq_flnext;
2153 continue;
2154 }
2155 /*
2156 * We're trying to get the hashlock out of order. This races
2157 * with dqlookup; so, we give up and go to the next dquot if
2158 * we couldn't get the hashlock. This way, we won't starve
2159 * a dqlookup process that holds the hashlock that is
2160 * waiting for the freelist lock.
2161 */
2162 if (! xfs_qm_dqhashlock_nowait(dqp)) {
2163 xfs_dqfunlock(dqp);
2164 xfs_dqunlock(dqp);
2165 dqp = dqp->dq_flnext;
2166 continue;
2167 }
2168 /*
2169 * This races with dquot allocation code as well as dqflush_all
2170 * and reclaim code. So, if we failed to grab the mplist lock,
2171 * give up everything and start over.
2172 */
2173 hash = dqp->q_hash;
2174 ASSERT(hash);
2175 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2176 /* XXX put a sentinel so that we can come back here */
2177 xfs_dqfunlock(dqp);
2178 xfs_dqunlock(dqp);
2179 XFS_DQ_HASH_UNLOCK(hash);
2180 xfs_qm_freelist_unlock(xfs_Gqm);
2181 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2182 return (nreclaimed);
2183 goto tryagain;
2184 }
2185 xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
2186#ifdef QUOTADEBUG
2187 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2188 dqp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
2189#endif
2190 ASSERT(dqp->q_nrefs == 0);
2191 nextdqp = dqp->dq_flnext;
2192 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2193 XQM_HASHLIST_REMOVE(hash, dqp);
2194 xfs_dqfunlock(dqp);
2195 xfs_qm_mplist_unlock(dqp->q_mount);
2196 XFS_DQ_HASH_UNLOCK(hash);
2197
2198 off_freelist:
2199 XQM_FREELIST_REMOVE(dqp);
2200 xfs_dqunlock(dqp);
2201 nreclaimed++;
2202 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
2203 xfs_qm_dqdestroy(dqp);
2204 dqp = nextdqp;
2205 }
2206 xfs_qm_freelist_unlock(xfs_Gqm);
2207 return (nreclaimed);
2208}
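
/*
 * The restart discipline above as a standalone sketch (pthreads,
 * hypothetical names): locks are taken strictly in the documented
 * order; an out-of-order lock is only ever trylocked, and on failure
 * everything is dropped and the scan restarts from the top, bounded by
 * a restart budget so it cannot livelock against the favored path.
 */
#include <pthread.h>

#define SKETCH_MAX_RESTARTS	4

static int scan_with_restarts(pthread_mutex_t *first, pthread_mutex_t *second)
{
	int restarts = 0;

tryagain:
	pthread_mutex_lock(first);
	if (pthread_mutex_trylock(second) != 0) {
		pthread_mutex_unlock(first);
		if (++restarts >= SKETCH_MAX_RESTARTS)
			return 0;	/* give up rather than spin forever */
		goto tryagain;
	}
	/* ... work under both locks ... */
	pthread_mutex_unlock(second);
	pthread_mutex_unlock(first);
	return 1;
}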
2209
2210
2211/*
2212 * The kmem_shake interface is invoked when memory is running low.
2213 */
2214/* ARGSUSED */
2215STATIC int
2216xfs_qm_shake(int nr_to_scan, unsigned int gfp_mask)
2217{
2218 int ndqused, nfree, n;
2219
2220 if (!kmem_shake_allow(gfp_mask))
2221 return (0);
2222 if (!xfs_Gqm)
2223 return (0);
2224
2225 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
2226 /* incore dquots in all f/s's */
2227 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2228
2229 ASSERT(ndqused >= 0);
2230
2231 if (nfree <= ndqused && nfree < ndquot)
2232 return (0);
2233
2234 ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
2235 n = nfree - ndqused - ndquot; /* # over target */
2236
2237 return xfs_qm_shake_freelist(MAX(nfree, n));
2238}
2239
2240
2241/*
2242 * Just pop the least recently used dquot off the freelist and
2243 * recycle it. The returned dquot is locked.
2244 */
2245STATIC xfs_dquot_t *
2246xfs_qm_dqreclaim_one(void)
2247{
2248 xfs_dquot_t *dqpout;
2249 xfs_dquot_t *dqp;
2250 int restarts;
2251 int nflushes;
2252
2253 restarts = 0;
2254 dqpout = NULL;
2255 nflushes = 0;
2256
2257 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2258 startagain:
2259 xfs_qm_freelist_lock(xfs_Gqm);
2260
2261 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2262 xfs_dqlock(dqp);
2263
2264 /*
2265 * We are racing with dqlookup here. Naturally we don't
2266 * want to reclaim a dquot that lookup wants. We release the
2267 * freelist lock and start over, so that lookup will grab
2268 * both the dquot and the freelistlock.
2269 */
2270 if (dqp->dq_flags & XFS_DQ_WANT) {
2271 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2272 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT");
2273 xfs_dqunlock(dqp);
2274 xfs_qm_freelist_unlock(xfs_Gqm);
2275 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2276 return (NULL);
2277 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2278 goto startagain;
2279 }
2280
2281 /*
2282 * If the dquot is inactive, we are assured that it is
2283 * not on the mplist or the hashlist, and that makes our
2284 * life easier.
2285 */
2286 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2287 ASSERT(dqp->q_mount == NULL);
2288 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2289 ASSERT(dqp->HL_PREVP == NULL);
2290 ASSERT(dqp->MPL_PREVP == NULL);
2291 XQM_FREELIST_REMOVE(dqp);
2292 xfs_dqunlock(dqp);
2293 dqpout = dqp;
2294 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2295 break;
2296 }
2297
2298 ASSERT(dqp->q_hash);
2299 ASSERT(dqp->MPL_PREVP);
2300
2301 /*
2302 * Try to grab the flush lock. If this dquot is in the process of
2303 * getting flushed to disk, we don't want to reclaim it.
2304 */
2305 if (! xfs_qm_dqflock_nowait(dqp)) {
2306 xfs_dqunlock(dqp);
2307 continue;
2308 }
2309
2310 /*
2311 * We have the flush lock so we know that this is not in the
2312 * process of being flushed. So, if this is dirty, flush it
2313 * DELWRI so that we don't get a freelist infested with
2314 * dirty dquots.
2315 */
2316 if (XFS_DQ_IS_DIRTY(dqp)) {
2317 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
2318 /*
2319 * We flush it delayed write, so don't bother
2320 * releasing the freelist lock.
2321 */
2322 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
2323 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2324 continue;
2325 }
2326
2327 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2328 xfs_dqfunlock(dqp);
2329 xfs_dqunlock(dqp);
2330 continue;
2331 }
2332
2333 if (! xfs_qm_dqhashlock_nowait(dqp))
2334 goto mplistunlock;
2335
2336 ASSERT(dqp->q_nrefs == 0);
2337 xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
2338 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2339 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2340 XQM_FREELIST_REMOVE(dqp);
2341 dqpout = dqp;
2342 XFS_DQ_HASH_UNLOCK(dqp->q_hash);
2343 mplistunlock:
2344 xfs_qm_mplist_unlock(dqp->q_mount);
2345 xfs_dqfunlock(dqp);
2346 xfs_dqunlock(dqp);
2347 if (dqpout)
2348 break;
2349 }
2350
2351 xfs_qm_freelist_unlock(xfs_Gqm);
2352 return (dqpout);
2353}
2354
2355
2356/*------------------------------------------------------------------*/
2357
2358/*
2359 * Return a new incore dquot. Depending on the number of
2360 * dquots in the system, we either allocate a new one on the kernel heap,
2361 * or reclaim a free one.
2362 * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
2363 * to reclaim an existing one from the freelist.
2364 */
2365boolean_t
2366xfs_qm_dqalloc_incore(
2367 xfs_dquot_t **O_dqpp)
2368{
2369 xfs_dquot_t *dqp;
2370
2371 /*
2372 * Check against high water mark to see if we want to pop
2373 * a nincompoop dquot off the freelist.
2374 */
2375 if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
2376 /*
2377 * Try to recycle a dquot from the freelist.
2378 */
2379 if ((dqp = xfs_qm_dqreclaim_one())) {
2380 XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
2381 /*
2382 * Just zero the core here. The rest will get
2383 * reinitialized by the caller. XXX we shouldn't even
2384 * do this zero ...
2385 */
2386 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
2387 *O_dqpp = dqp;
2388 return (B_FALSE);
2389 }
2390 XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
2391 }
2392
2393 /*
2394 * Allocate a brand new dquot on the kernel heap and return it
2395 * to the caller to initialize.
2396 */
2397 ASSERT(xfs_Gqm->qm_dqzone != NULL);
2398 *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
2399 atomic_inc(&xfs_Gqm->qm_totaldquots);
2400
2401 return (B_TRUE);
2402}
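
/*
 * Minimal userspace sketch of the allocate-or-recycle policy above
 * (hypothetical names): past a high-water mark, prefer recycling an
 * object off a free list and zeroing only its core; otherwise fall back
 * to a fresh heap allocation, mirroring the B_FALSE/B_TRUE return.
 */
#include <stdlib.h>
#include <string.h>

struct sk_obj { struct sk_obj *next; char core[64]; };

static struct sk_obj *sk_freelist;
static int sk_total, sk_highwater = 1024;

static struct sk_obj *sk_alloc(int *fresh)
{
	struct sk_obj *o;

	if (sk_total >= sk_highwater && sk_freelist) {
		o = sk_freelist;		/* recycle from the free list */
		sk_freelist = o->next;
		memset(o->core, 0, sizeof(o->core));	/* zero only the core */
		*fresh = 0;
		return o;
	}
	o = calloc(1, sizeof(*o));		/* brand new allocation */
	if (o)
		sk_total++;
	*fresh = 1;
	return o;
}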
2403
2404
2405/*
2406 * Start a transaction and write the incore superblock changes to
2407 * disk. flags parameter indicates which fields have changed.
2408 */
2409int
2410xfs_qm_write_sb_changes(
2411 xfs_mount_t *mp,
2412 __int64_t flags)
2413{
2414 xfs_trans_t *tp;
2415 int error;
2416
2417#ifdef QUOTADEBUG
2418 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname);
2419#endif
2420 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2421 if ((error = xfs_trans_reserve(tp, 0,
2422 mp->m_sb.sb_sectsize + 128, 0,
2423 0,
2424 XFS_DEFAULT_LOG_COUNT))) {
2425 xfs_trans_cancel(tp, 0);
2426 return (error);
2427 }
2428
2429 xfs_mod_sb(tp, flags);
2430 (void) xfs_trans_commit(tp, 0, NULL);
2431
2432 return (0);
2433}
2434
2435
2436/* --------------- utility functions for vnodeops ---------------- */
2437
2438
2439/*
2440 * Given an inode, a uid and gid (from cred_t), make sure that we have
2441 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2442 * quotas by creating this file.
2443 * This also attaches dquot(s) to the given inode after locking it,
2444 * and returns the dquots corresponding to the uid and/or gid.
2445 *
2446 * in : inode (unlocked)
2447 * out : udquot, gdquot with references taken and unlocked
2448 */
2449int
2450xfs_qm_vop_dqalloc(
2451 xfs_mount_t *mp,
2452 xfs_inode_t *ip,
2453 uid_t uid,
2454 gid_t gid,
2455 uint flags,
2456 xfs_dquot_t **O_udqpp,
2457 xfs_dquot_t **O_gdqpp)
2458{
2459 int error;
2460 xfs_dquot_t *uq, *gq;
2461 uint lockflags;
2462
2463 if (!XFS_IS_QUOTA_ON(mp))
2464 return 0;
2465
2466 lockflags = XFS_ILOCK_EXCL;
2467 xfs_ilock(ip, lockflags);
2468
2469 if ((flags & XFS_QMOPT_INHERIT) &&
2470 XFS_INHERIT_GID(ip, XFS_MTOVFS(mp)))
2471 gid = ip->i_d.di_gid;
2472
2473 /*
2474 * Attach the dquot(s) to this inode, doing a dquot allocation
2475 * if necessary. The dquot(s) will not be locked.
2476 */
2477 if (XFS_NOT_DQATTACHED(mp, ip)) {
2478 if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
2479 XFS_QMOPT_ILOCKED))) {
2480 xfs_iunlock(ip, lockflags);
2481 return (error);
2482 }
2483 }
2484
2485 uq = gq = NULL;
2486 if ((flags & XFS_QMOPT_UQUOTA) &&
2487 XFS_IS_UQUOTA_ON(mp)) {
2488 if (ip->i_d.di_uid != uid) {
2489 /*
2490 * What we need is the dquot that has this uid, and
2491 * if we send the inode to dqget, the uid of the inode
2492 * takes priority over what's sent in the uid argument.
2493 * We must unlock the inode here before calling dqget if
2494 * we're not sending the inode, because otherwise
2495 * we'll deadlock by doing trans_reserve while
2496 * holding ilock.
2497 */
2498 xfs_iunlock(ip, lockflags);
2499 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
2500 XFS_DQ_USER,
2501 XFS_QMOPT_DQALLOC |
2502 XFS_QMOPT_DOWARN,
2503 &uq))) {
2504 ASSERT(error != ENOENT);
2505 return (error);
2506 }
2507 /*
2508 * Get the ilock in the right order.
2509 */
2510 xfs_dqunlock(uq);
2511 lockflags = XFS_ILOCK_SHARED;
2512 xfs_ilock(ip, lockflags);
2513 } else {
2514 /*
2515 * Take an extra reference, because we'll return
2516	 * this to the caller.
2517 */
2518 ASSERT(ip->i_udquot);
2519 uq = ip->i_udquot;
2520 xfs_dqlock(uq);
2521 XFS_DQHOLD(uq);
2522 xfs_dqunlock(uq);
2523 }
2524 }
2525 if ((flags & XFS_QMOPT_GQUOTA) &&
2526 XFS_IS_GQUOTA_ON(mp)) {
2527 if (ip->i_d.di_gid != gid) {
2528 xfs_iunlock(ip, lockflags);
2529 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
2530 XFS_DQ_GROUP,
2531 XFS_QMOPT_DQALLOC |
2532 XFS_QMOPT_DOWARN,
2533 &gq))) {
2534 if (uq)
2535 xfs_qm_dqrele(uq);
2536 ASSERT(error != ENOENT);
2537 return (error);
2538 }
2539 xfs_dqunlock(gq);
2540 lockflags = XFS_ILOCK_SHARED;
2541 xfs_ilock(ip, lockflags);
2542 } else {
2543 ASSERT(ip->i_gdquot);
2544 gq = ip->i_gdquot;
2545 xfs_dqlock(gq);
2546 XFS_DQHOLD(gq);
2547 xfs_dqunlock(gq);
2548 }
2549 }
2550 if (uq)
2551 xfs_dqtrace_entry_ino(uq, "DQALLOC", ip);
2552
2553 xfs_iunlock(ip, lockflags);
2554 if (O_udqpp)
2555 *O_udqpp = uq;
2556 else if (uq)
2557 xfs_qm_dqrele(uq);
2558 if (O_gdqpp)
2559 *O_gdqpp = gq;
2560 else if (gq)
2561 xfs_qm_dqrele(gq);
2562 return (0);
2563}
2564
2565/*
2566 * Actually transfer ownership, and do dquot modifications.
2567 * These were already reserved.
2568 */
2569xfs_dquot_t *
2570xfs_qm_vop_chown(
2571 xfs_trans_t *tp,
2572 xfs_inode_t *ip,
2573 xfs_dquot_t **IO_olddq,
2574 xfs_dquot_t *newdq)
2575{
2576 xfs_dquot_t *prevdq;
2577 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
2578 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
2579
2580 /* old dquot */
2581 prevdq = *IO_olddq;
2582 ASSERT(prevdq);
2583 ASSERT(prevdq != newdq);
2584
2585 xfs_trans_mod_dquot(tp, prevdq,
2586 XFS_TRANS_DQ_BCOUNT,
2587 -(ip->i_d.di_nblocks));
2588 xfs_trans_mod_dquot(tp, prevdq,
2589 XFS_TRANS_DQ_ICOUNT,
2590 -1);
2591
2592 /* the sparkling new dquot */
2593 xfs_trans_mod_dquot(tp, newdq,
2594 XFS_TRANS_DQ_BCOUNT,
2595 ip->i_d.di_nblocks);
2596 xfs_trans_mod_dquot(tp, newdq,
2597 XFS_TRANS_DQ_ICOUNT,
2598 1);
2599
2600 /*
2601 * Take an extra reference, because the inode
2602 * is going to keep this dquot pointer even
2603 * after the trans_commit.
2604 */
2605 xfs_dqlock(newdq);
2606 XFS_DQHOLD(newdq);
2607 xfs_dqunlock(newdq);
2608 *IO_olddq = newdq;
2609
2610 return (prevdq);
2611}
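
/*
 * The chown transfer above in miniature (illustrative, userspace):
 * moving an inode's usage between quota records is two symmetric
 * adjustments -- done under one transaction in the real code, so both
 * sides commit or neither does.
 */
struct sk_usage { long long blocks, inodes; };

static void sk_chown_usage(struct sk_usage *olddq, struct sk_usage *newdq,
			   long long nblocks)
{
	olddq->blocks -= nblocks;	/* debit the previous owner */
	olddq->inodes -= 1;
	newdq->blocks += nblocks;	/* credit the new owner */
	newdq->inodes += 1;
}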
2612
2613/*
2614 * Quota reservations for setattr(AT_UID|AT_GID).
2615 */
2616int
2617xfs_qm_vop_chown_reserve(
2618 xfs_trans_t *tp,
2619 xfs_inode_t *ip,
2620 xfs_dquot_t *udqp,
2621 xfs_dquot_t *gdqp,
2622 uint flags)
2623{
2624 int error;
2625 xfs_mount_t *mp;
2626 uint delblks;
2627 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
2628
2629 ASSERT(XFS_ISLOCKED_INODE(ip));
2630 mp = ip->i_mount;
2631 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
2632
2633 delblks = ip->i_delayed_blks;
2634 delblksudq = delblksgdq = unresudq = unresgdq = NULL;
2635
2636 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
2637 ip->i_d.di_uid != (uid_t)INT_GET(udqp->q_core.d_id, ARCH_CONVERT)) {
2638 delblksudq = udqp;
2639 /*
2640 * If there are delayed allocation blocks, then we have to
2641 * unreserve those from the old dquot, and add them to the
2642 * new dquot.
2643 */
2644 if (delblks) {
2645 ASSERT(ip->i_udquot);
2646 unresudq = ip->i_udquot;
2647 }
2648 }
2649 if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
2650 ip->i_d.di_gid != INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)) {
2651 delblksgdq = gdqp;
2652 if (delblks) {
2653 ASSERT(ip->i_gdquot);
2654 unresgdq = ip->i_gdquot;
2655 }
2656 }
2657
2658 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
2659 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
2660 flags | XFS_QMOPT_RES_REGBLKS)))
2661 return (error);
2662
2663 /*
2664 * Do the delayed blks reservations/unreservations now. Since these
2665 * are done without the help of a transaction, if a reservation fails,
2666 * its previous reservations won't be automatically undone by trans
2667 * code. So, we have to do it manually here.
2668 */
2669 if (delblks) {
2670 /*
2671 * Do the reservations first. Unreservation can't fail.
2672 */
2673 ASSERT(delblksudq || delblksgdq);
2674 ASSERT(unresudq || unresgdq);
2675 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
2676 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
2677 flags | XFS_QMOPT_RES_REGBLKS)))
2678 return (error);
2679 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
2680 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
2681 XFS_QMOPT_RES_REGBLKS);
2682 }
2683
2684 return (0);
2685}
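
/*
 * Sketch of the delayed-block step above (hypothetical names): with no
 * transaction to unwind a failure, the order matters -- reserve against
 * the new owner first, and only if that succeeds release the old
 * owner's reservation, which by construction cannot fail.
 */
struct sk_resv { long long reserved, limit; };

static int sk_shift_delblks(struct sk_resv *newq, struct sk_resv *oldq,
			    long long delblks)
{
	if (newq->reserved + delblks > newq->limit)
		return -1;		/* over quota: fail with nothing to undo */
	newq->reserved += delblks;	/* reserve on the new dquot first */
	oldq->reserved -= delblks;	/* then unreserve the old; cannot fail */
	return 0;
}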
2686
2687int
2688xfs_qm_vop_rename_dqattach(
2689 xfs_inode_t **i_tab)
2690{
2691 xfs_inode_t *ip;
2692 int i;
2693 int error;
2694
2695 ip = i_tab[0];
2696
2697 if (! XFS_IS_QUOTA_ON(ip->i_mount))
2698 return (0);
2699
2700 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
2701 error = xfs_qm_dqattach(ip, 0);
2702 if (error)
2703 return (error);
2704 }
2705 for (i = 1; (i < 4 && i_tab[i]); i++) {
2706 /*
2707 * Watch out for duplicate entries in the table.
2708 */
2709 if ((ip = i_tab[i]) != i_tab[i-1]) {
2710 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
2711 error = xfs_qm_dqattach(ip, 0);
2712 if (error)
2713 return (error);
2714 }
2715 }
2716 }
2717 return (0);
2718}
2719
2720void
2721xfs_qm_vop_dqattach_and_dqmod_newinode(
2722 xfs_trans_t *tp,
2723 xfs_inode_t *ip,
2724 xfs_dquot_t *udqp,
2725 xfs_dquot_t *gdqp)
2726{
2727 if (!XFS_IS_QUOTA_ON(tp->t_mountp))
2728 return;
2729
2730 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
2731 ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
2732
2733 if (udqp) {
2734 xfs_dqlock(udqp);
2735 XFS_DQHOLD(udqp);
2736 xfs_dqunlock(udqp);
2737 ASSERT(ip->i_udquot == NULL);
2738 ip->i_udquot = udqp;
2739 ASSERT(ip->i_d.di_uid == INT_GET(udqp->q_core.d_id, ARCH_CONVERT));
2740 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
2741 }
2742 if (gdqp) {
2743 xfs_dqlock(gdqp);
2744 XFS_DQHOLD(gdqp);
2745 xfs_dqunlock(gdqp);
2746 ASSERT(ip->i_gdquot == NULL);
2747 ip->i_gdquot = gdqp;
2748 ASSERT(ip->i_d.di_gid == INT_GET(gdqp->q_core.d_id, ARCH_CONVERT));
2749 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2750 }
2751}
2752
2753/* ------------- list stuff -----------------*/
2754void
2755xfs_qm_freelist_init(xfs_frlist_t *ql)
2756{
2757 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2758 mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf");
2759 ql->qh_version = 0;
2760 ql->qh_nelems = 0;
2761}
2762
2763void
2764xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2765{
2766 xfs_dquot_t *dqp, *nextdqp;
2767
2768 mutex_lock(&ql->qh_lock, PINOD);
2769 for (dqp = ql->qh_next;
2770 dqp != (xfs_dquot_t *)ql; ) {
2771 xfs_dqlock(dqp);
2772 nextdqp = dqp->dq_flnext;
2773#ifdef QUOTADEBUG
2774 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2775#endif
2776 XQM_FREELIST_REMOVE(dqp);
2777 xfs_dqunlock(dqp);
2778 xfs_qm_dqdestroy(dqp);
2779 dqp = nextdqp;
2780 }
2781 /*
2782	 * Don't bother unlocking.
2783 */
2784 mutex_destroy(&ql->qh_lock);
2785
2786 ASSERT(ql->qh_nelems == 0);
2787}
2788
2789void
2790xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2791{
2792 dq->dq_flnext = ql->qh_next;
2793 dq->dq_flprev = (xfs_dquot_t *)ql;
2794 ql->qh_next = dq;
2795 dq->dq_flnext->dq_flprev = dq;
2796 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2797 xfs_Gqm->qm_dqfreelist.qh_version++;
2798}
2799
2800void
2801xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2802{
2803 xfs_dquot_t *next = dq->dq_flnext;
2804 xfs_dquot_t *prev = dq->dq_flprev;
2805
2806 next->dq_flprev = prev;
2807 prev->dq_flnext = next;
2808 dq->dq_flnext = dq->dq_flprev = dq;
2809 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2810 xfs_Gqm->qm_dqfreelist.qh_version++;
2811}
2812
2813void
2814xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2815{
2816 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2817}
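
/*
 * The freelist above is a circular doubly-linked list whose head acts
 * as a sentinel (qh_next/qh_prev line up with dq_flnext/dq_flprev), so
 * insert and unlink need no NULL checks, and append is just an insert
 * after qh_prev. A standalone sketch of the same shape:
 */
struct sk_node { struct sk_node *next, *prev; };

static void sk_list_init(struct sk_node *head)
{
	head->next = head->prev = head;	/* empty list points at itself */
}

static void sk_insert_after(struct sk_node *pos, struct sk_node *n)
{
	n->next = pos->next;
	n->prev = pos;
	pos->next->prev = n;
	pos->next = n;
}

static void sk_unlink(struct sk_node *n)
{
	n->next->prev = n->prev;
	n->prev->next = n->next;
	n->next = n->prev = n;		/* self-linked, like an unlinked dquot */
}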
2818
2819int
2820xfs_qm_dqhashlock_nowait(
2821 xfs_dquot_t *dqp)
2822{
2823 int locked;
2824
2825 locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
2826 return (locked);
2827}
2828
2829int
2830xfs_qm_freelist_lock_nowait(
2831 xfs_qm_t *xqm)
2832{
2833 int locked;
2834
2835 locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
2836 return (locked);
2837}
2838
2839int
2840xfs_qm_mplist_nowait(
2841 xfs_mount_t *mp)
2842{
2843 int locked;
2844
2845 ASSERT(mp->m_quotainfo);
2846 locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
2847 return (locked);
2848}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
new file mode 100644
index 000000000000..dcf1a7a831d8
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm.h
@@ -0,0 +1,236 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QM_H__
33#define __XFS_QM_H__
34
35#include "xfs_dquot_item.h"
36#include "xfs_dquot.h"
37#include "xfs_quota_priv.h"
38#include "xfs_qm_stats.h"
39
40struct xfs_qm;
41struct xfs_inode;
42
43extern mutex_t xfs_Gqm_lock;
44extern struct xfs_qm *xfs_Gqm;
45extern kmem_zone_t *qm_dqzone;
46extern kmem_zone_t *qm_dqtrxzone;
47
48/*
49 * Used in xfs_qm_sync (called by xfs_sync) to bound the number of times it
50 * can iterate over the mountpoint's dquot list in one call.
51 */
52#define XFS_QM_SYNC_MAX_RESTARTS 7
53
54/*
55 * Ditto, for xfs_qm_dqreclaim_one.
56 */
57#define XFS_QM_RECLAIM_MAX_RESTARTS 4
58
59/*
60 * Ideal ratio of free to in-use dquots. The quota manager makes an attempt
61 * to keep this balance.
62 */
63#define XFS_QM_DQFREE_RATIO 2
64
65/*
66 * Dquot hashtable constants/threshold values.
67 */
68#define XFS_QM_NCSIZE_THRESHOLD 5000
69#define XFS_QM_HASHSIZE_LOW 32
70#define XFS_QM_HASHSIZE_HIGH 64
71
72/*
73 * We output a cmn_err when quotachecking a quota file with more than
74 * this many fsbs.
75 */
76#define XFS_QM_BIG_QCHECK_NBLKS 500
77
78/*
79 * This defines the unit of allocation of dquots.
80 * Currently, it is just one file system block, and a 4K blk contains 30
81 * (136 * 30 = 4080) dquots. It's probably not worth trying to make
82 * this more dynamic.
83 * XXXsup However, if this number is changed, we have to make sure that we don't
84 * implicitly assume that we do allocations in chunks of a single filesystem
85 * block in the dquot/xqm code.
86 */
87#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
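
/*
 * Illustrative check of the arithmetic in the comment above: 30 dquots
 * of 136 bytes fill 4080 of a 4096-byte block, the rest is slack. A
 * sketch only, spelled as a portable C89 compile-time assertion (a
 * negative array size would fail the build).
 */
typedef char sk_dqblk_fits_in_4k[(136 * 30 <= 4096) ? 1 : -1];
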
88/*
89 * When doing a quotacheck, we log dquot clusters of this many FSBs at most
90 * in a single transaction. We don't want to ask for too large a log reservation.
91 */
92#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
93
94typedef xfs_dqhash_t xfs_dqlist_t;
95/*
96 * The freelist head. The first two fields match the first two in the
97 * xfs_dquot_t structure (in xfs_dqmarker_t)
98 */
99typedef struct xfs_frlist {
100 struct xfs_dquot *qh_next;
101 struct xfs_dquot *qh_prev;
102 mutex_t qh_lock;
103 uint qh_version;
104 uint qh_nelems;
105} xfs_frlist_t;
106
107/*
108 * Quota Manager (global) structure. Lives only in core.
109 */
110typedef struct xfs_qm {
111 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
112 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
113 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
114 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */
115 atomic_t qm_totaldquots; /* total incore dquots */
116 uint qm_nrefs; /* file systems with quota on */
117 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
118 kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
119 kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
120} xfs_qm_t;
121
122/*
123 * Various quota information for individual filesystems.
124 * The mount structure keeps a pointer to this.
125 */
126typedef struct xfs_quotainfo {
127 xfs_inode_t *qi_uquotaip; /* user quota inode */
128 xfs_inode_t *qi_gquotaip; /* group quota inode */
129 lock_t qi_pinlock; /* dquot pinning mutex */
130 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
131 int qi_dqreclaims; /* a change here indicates
132 a removal in the dqlist */
133 time_t qi_btimelimit; /* limit for blks timer */
134 time_t qi_itimelimit; /* limit for inodes timer */
135 time_t qi_rtbtimelimit;/* limit for rt blks timer */
136 xfs_qwarncnt_t qi_bwarnlimit; /* limit for num warnings */
137 xfs_qwarncnt_t qi_iwarnlimit; /* limit for num warnings */
138 mutex_t qi_quotaofflock;/* to serialize quotaoff */
139 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
140 uint qi_dqperchunk; /* # ondisk dqs in above chunk */
141 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
142 xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */
143 xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */
144 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
145 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
146 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
147} xfs_quotainfo_t;
148
149
150extern xfs_dqtrxops_t xfs_trans_dquot_ops;
151
152extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
153extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
154 xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
155extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
156extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
157
158/*
159 * We keep the usr and grp dquots separately so that locking will be easier
160 * to do at commit time. All transactions that we know of at this point
161 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
162 */
163#define XFS_QM_TRANS_MAXDQS 2
164typedef struct xfs_dquot_acct {
165 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
166 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
167} xfs_dquot_acct_t;
168
169/*
170 * Users are allowed to have a usage exceeding their softlimit for
171 * a period this long.
172 */
173#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */
174#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
175#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
176
177#define XFS_QM_BWARNLIMIT 5
178#define XFS_QM_IWARNLIMIT 5
179
180#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD))
181#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock))
182#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++)
183#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
184
185extern void xfs_mount_reset_sbqflags(xfs_mount_t *);
186
187extern int xfs_qm_init_quotainfo(xfs_mount_t *);
188extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
189extern int xfs_qm_mount_quotas(xfs_mount_t *, int);
190extern void xfs_qm_mount_quotainit(xfs_mount_t *, uint);
191extern int xfs_qm_quotacheck(xfs_mount_t *);
192extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
193extern int xfs_qm_unmount_quotas(xfs_mount_t *);
194extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
195extern int xfs_qm_sync(xfs_mount_t *, short);
196
197/* dquot stuff */
198extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
199extern int xfs_qm_dqattach(xfs_inode_t *, uint);
200extern void xfs_qm_dqdetach(xfs_inode_t *);
201extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
202extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
203
204/* vop stuff */
205extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
206 uid_t, gid_t, uint,
207 xfs_dquot_t **, xfs_dquot_t **);
208extern void xfs_qm_vop_dqattach_and_dqmod_newinode(
209 xfs_trans_t *, xfs_inode_t *,
210 xfs_dquot_t *, xfs_dquot_t *);
211extern int xfs_qm_vop_rename_dqattach(xfs_inode_t **);
212extern xfs_dquot_t * xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
213 xfs_dquot_t **, xfs_dquot_t *);
214extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
215 xfs_dquot_t *, xfs_dquot_t *, uint);
216
217/* list stuff */
218extern void xfs_qm_freelist_init(xfs_frlist_t *);
219extern void xfs_qm_freelist_destroy(xfs_frlist_t *);
220extern void xfs_qm_freelist_insert(xfs_frlist_t *, xfs_dquot_t *);
221extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
222extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
223extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *);
224extern int xfs_qm_mplist_nowait(xfs_mount_t *);
225extern int xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
226
227/* system call interface */
228extern int xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t);
229
230#ifdef DEBUG
231extern int xfs_qm_internalqcheck(xfs_mount_t *);
232#else
233#define xfs_qm_internalqcheck(mp) (0)
234#endif
235
236#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
new file mode 100644
index 000000000000..be67d9c265f8
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -0,0 +1,410 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_clnt.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67
68#include "xfs_qm.h"
69
70#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
71#define MNTOPT_NOQUOTA "noquota" /* no quotas */
72#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
73#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
74#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
75#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
76#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota, no limit enforcement */
77#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota, no limit enforcement */
78#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
79
80STATIC int
81xfs_qm_parseargs(
82 struct bhv_desc *bhv,
83 char *options,
84 struct xfs_mount_args *args,
85 int update)
86{
87 size_t length;
88 char *local_options = options;
89 char *this_char;
90 int error;
91 int referenced = update;
92
93 while ((this_char = strsep(&local_options, ",")) != NULL) {
94 length = strlen(this_char);
95 if (local_options)
96 length++;
97
98 if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
99 args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA);
100 args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA);
101 referenced = update;
102 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
103 !strcmp(this_char, MNTOPT_UQUOTA) ||
104 !strcmp(this_char, MNTOPT_USRQUOTA)) {
105 args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
106 referenced = 1;
107 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
108 !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
109 args->flags |= XFSMNT_UQUOTA;
110 args->flags &= ~XFSMNT_UQUOTAENF;
111 referenced = 1;
112 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
113 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
114 args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
115 referenced = 1;
116 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
117 args->flags |= XFSMNT_GQUOTA;
118 args->flags &= ~XFSMNT_GQUOTAENF;
119 referenced = 1;
120 } else {
121 if (local_options)
122 *(local_options-1) = ',';
123 continue;
124 }
125
126 while (length--)
127 *this_char++ = ',';
128 }
129
130 PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error);
131 if (!error && !referenced)
132 bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM);
133 return error;
134}
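
/*
 * A userspace sketch of the strsep() idiom above (hypothetical option
 * name, glibc's strsep()): strsep() cuts the string in place by writing
 * NULs, so options this layer consumes are blanked out with commas,
 * while options it does not recognize get their separator written back
 * before the string is handed to the next layer.
 */
#include <string.h>

static void sk_scan_options(char *options)
{
	char *rest = options, *tok;
	size_t len;

	while ((tok = strsep(&rest, ",")) != NULL) {
		len = strlen(tok) + (rest ? 1 : 0);
		if (strcmp(tok, "sketchquota") == 0)
			memset(tok, ',', len);	/* consumed: blank it out */
		else if (rest)
			rest[-1] = ',';		/* not ours: restore the separator */
	}
}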
135
136STATIC int
137xfs_qm_showargs(
138 struct bhv_desc *bhv,
139 struct seq_file *m)
140{
141 struct vfs *vfsp = bhvtovfs(bhv);
142 struct xfs_mount *mp = XFS_VFSTOM(vfsp);
143 int error;
144
145 if (mp->m_qflags & XFS_UQUOTA_ACCT) {
146 (mp->m_qflags & XFS_UQUOTA_ENFD) ?
147 seq_puts(m, "," MNTOPT_USRQUOTA) :
148 seq_puts(m, "," MNTOPT_UQUOTANOENF);
149 }
150
151 if (mp->m_qflags & XFS_GQUOTA_ACCT) {
152 (mp->m_qflags & XFS_GQUOTA_ENFD) ?
153 seq_puts(m, "," MNTOPT_GRPQUOTA) :
154 seq_puts(m, "," MNTOPT_GQUOTANOENF);
155 }
156
157 if (!(mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT)))
158 seq_puts(m, "," MNTOPT_NOQUOTA);
159
160 PVFS_SHOWARGS(BHV_NEXT(bhv), m, error);
161 return error;
162}
163
164STATIC int
165xfs_qm_mount(
166 struct bhv_desc *bhv,
167 struct xfs_mount_args *args,
168 struct cred *cr)
169{
170 struct vfs *vfsp = bhvtovfs(bhv);
171 struct xfs_mount *mp = XFS_VFSTOM(vfsp);
172 int error;
173
174 if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA))
175 xfs_qm_mount_quotainit(mp, args->flags);
176 PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error);
177 return error;
178}
179
180STATIC int
181xfs_qm_syncall(
182 struct bhv_desc *bhv,
183 int flags,
184 cred_t *credp)
185{
186 struct vfs *vfsp = bhvtovfs(bhv);
187 struct xfs_mount *mp = XFS_VFSTOM(vfsp);
188 int error;
189
190 /*
191 * Get the Quota Manager to flush the dquots.
192 */
193 if (XFS_IS_QUOTA_ON(mp)) {
194 if ((error = xfs_qm_sync(mp, flags))) {
195 /*
196 * If we got an IO error, we will be shutting down.
197 * So, there's nothing more for us to do here.
198 */
199 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
200 if (XFS_FORCED_SHUTDOWN(mp)) {
201 return XFS_ERROR(error);
202 }
203 }
204 }
205 PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error);
206 return error;
207}
208
209/*
210 * Clear the quotaflags in memory and in the superblock.
211 */
212void
213xfs_mount_reset_sbqflags(
214 xfs_mount_t *mp)
215{
216 xfs_trans_t *tp;
217 unsigned long s;
218
219 mp->m_qflags = 0;
220 /*
221 * It is OK to look at sb_qflags here in mount path,
222 * without SB_LOCK.
223 */
224 if (mp->m_sb.sb_qflags == 0)
225 return;
226 s = XFS_SB_LOCK(mp);
227 mp->m_sb.sb_qflags = 0;
228 XFS_SB_UNLOCK(mp, s);
229
230 /*
231 * If the fs is readonly, let the incore superblock run
232 * with quotas off, but don't flush the update out to disk.
233 */
234 if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
235 return;
236#ifdef QUOTADEBUG
237 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
238#endif
239 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
240 if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
241 XFS_DEFAULT_LOG_COUNT)) {
242 xfs_trans_cancel(tp, 0);
243 xfs_fs_cmn_err(CE_ALERT, mp,
244 "xfs_mount_reset_sbqflags: Superblock update failed!");
245 return;
246 }
247 xfs_mod_sb(tp, XFS_SB_QFLAGS);
248 xfs_trans_commit(tp, 0, NULL);
249}
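/*
 * The alloc/reserve/modify/commit sequence above is the canonical XFS
 * transaction pattern used throughout the quota code; a minimal sketch
 * of the shape (error handling elided):
 *
 *	tp = xfs_trans_alloc(mp, type);
 *	if (xfs_trans_reserve(tp, ...))	-> xfs_trans_cancel(tp, 0), bail
 *	...log the changes, e.g. xfs_mod_sb(tp, XFS_SB_QFLAGS)...
 *	xfs_trans_commit(tp, 0, NULL);
 */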
250
251STATIC int
252xfs_qm_newmount(
253 xfs_mount_t *mp,
254 uint *needquotamount,
255 uint *quotaflags)
256{
257 uint quotaondisk;
258 uint uquotaondisk = 0, gquotaondisk = 0;
259
260 *quotaflags = 0;
261 *needquotamount = B_FALSE;
262
263 quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
264 mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT);
265
266 if (quotaondisk) {
267 uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
268 gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
269 }
270
271 /*
272 * If the device itself is read-only, we can't allow
273 * the user to change the state of quota on the mount -
274 * this would generate a transaction on the ro device,
275 * which would lead to an I/O error and shutdown.
276 */
277
278 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
279 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
280 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
281 (!gquotaondisk && XFS_IS_GQUOTA_ON(mp))) &&
282 xfs_dev_is_read_only(mp, "changing quota state")) {
283 cmn_err(CE_WARN,
284 "XFS: please mount with%s%s%s.",
285 (!quotaondisk ? "out quota" : ""),
286 (uquotaondisk ? " usrquota" : ""),
287 (gquotaondisk ? " grpquota" : ""));
288 return XFS_ERROR(EPERM);
289 }
290
291 if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
292 /*
293 * Call mount_quotas at this point only if we won't have to do
294 * a quotacheck.
295 */
296 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
297 /*
298 * If an error occurred, qm_mount_quotas code
299 * has already disabled quotas. So, just finish
300 * mounting, and get on with the boring life
301 * without disk quotas.
302 */
303 xfs_qm_mount_quotas(mp, 0);
304 } else {
305 /*
306 * Clear the quota flags, but remember them. This
307 * is so that the quota code doesn't get invoked
308 * before we're ready. This can happen when an
309 * inode goes inactive and wants to free blocks,
310 * or via xfs_log_mount_finish.
311 */
312 *needquotamount = B_TRUE;
313 *quotaflags = mp->m_qflags;
314 mp->m_qflags = 0;
315 }
316 }
317
318 return 0;
319}
320
321STATIC int
322xfs_qm_endmount(
323 xfs_mount_t *mp,
324 uint needquotamount,
325 uint quotaflags,
326 int mfsi_flags)
327{
328 if (needquotamount) {
329 ASSERT(mp->m_qflags == 0);
330 mp->m_qflags = quotaflags;
331 xfs_qm_mount_quotas(mp, mfsi_flags);
332 }
333
334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
335 if (! (XFS_IS_QUOTA_ON(mp)))
336 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
337 else
338 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
339#endif
340
341#ifdef QUOTADEBUG
342 if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
343 cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
344#endif
345
346 return 0;
347}
348
349STATIC void
350xfs_qm_dqrele_null(
351 xfs_dquot_t *dq)
352{
353 /*
354 * Called from XFS, where we always check first for a NULL dquot.
355 */
356 if (!dq)
357 return;
358 xfs_qm_dqrele(dq);
359}
360
361
362struct xfs_qmops xfs_qmcore_xfs = {
363 .xfs_qminit = xfs_qm_newmount,
364 .xfs_qmdone = xfs_qm_unmount_quotadestroy,
365 .xfs_qmmount = xfs_qm_endmount,
366 .xfs_qmunmount = xfs_qm_unmount_quotas,
367 .xfs_dqrele = xfs_qm_dqrele_null,
368 .xfs_dqattach = xfs_qm_dqattach,
369 .xfs_dqdetach = xfs_qm_dqdetach,
370 .xfs_dqpurgeall = xfs_qm_dqpurge_all,
371 .xfs_dqvopalloc = xfs_qm_vop_dqalloc,
372 .xfs_dqvopcreate = xfs_qm_vop_dqattach_and_dqmod_newinode,
373 .xfs_dqvoprename = xfs_qm_vop_rename_dqattach,
374 .xfs_dqvopchown = xfs_qm_vop_chown,
375 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
376 .xfs_dqtrxops = &xfs_trans_dquot_ops,
377};
378
379struct bhv_vfsops xfs_qmops = { {
380 BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM),
381 .vfs_parseargs = xfs_qm_parseargs,
382 .vfs_showargs = xfs_qm_showargs,
383 .vfs_mount = xfs_qm_mount,
384 .vfs_sync = xfs_qm_syncall,
385 .vfs_quotactl = xfs_qm_quotactl, },
386};
387
388
389void __init
390xfs_qm_init(void)
391{
392 static char message[] __initdata =
393 KERN_INFO "SGI XFS Quota Management subsystem\n";
394
395 printk(message);
396 mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock");
397 vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs);
398 xfs_qm_init_procfs();
399}
400
401void __exit
402xfs_qm_exit(void)
403{
404 vfs_bhv_clr_custom(&xfs_qmops);
405 xfs_qm_cleanup_procfs();
406 if (qm_dqzone)
407 kmem_cache_destroy(qm_dqzone);
408 if (qm_dqtrxzone)
409 kmem_cache_destroy(qm_dqtrxzone);
410}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
new file mode 100644
index 000000000000..29978e037fee
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -0,0 +1,149 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_dir.h"
40#include "xfs_dir2.h"
41#include "xfs_alloc.h"
42#include "xfs_dmapi.h"
43#include "xfs_quota.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_bit.h"
57#include "xfs_rtalloc.h"
58#include "xfs_error.h"
59#include "xfs_itable.h"
60#include "xfs_rw.h"
61#include "xfs_acl.h"
62#include "xfs_cap.h"
63#include "xfs_mac.h"
64#include "xfs_attr.h"
65#include "xfs_buf_item.h"
66
67#include "xfs_qm.h"
68
69struct xqmstats xqmstats;
70
71STATIC int
72xfs_qm_read_xfsquota(
73 char *buffer,
74 char **start,
75 off_t offset,
76 int count,
77 int *eof,
78 void *data)
79{
80 int len;
81
82 /* maximum; incore; ratio free to inuse; freelist */
83 len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
84 ndquot,
85 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
86 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
87 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
88
89 if (offset >= len) {
90 *start = buffer;
91 *eof = 1;
92 return 0;
93 }
94 *start = buffer + offset;
95 if ((len -= offset) > count)
96 return count;
97 *eof = 1;
98
99 return len;
100}
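/*
 * The offset/count/start/eof handling above implements the legacy procfs
 * read_proc contract: the whole report is formatted into "buffer" on
 * every call, and only the slice the caller asked for is handed back.
 * In sketch form, for a report of len bytes:
 *
 *	if (offset >= len)	nothing left: set *eof, return 0
 *	*start = buffer + offset;
 *	return min(len - offset, count);  and set *eof if it all fit
 */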
101
102STATIC int
103xfs_qm_read_stats(
104 char *buffer,
105 char **start,
106 off_t offset,
107 int count,
108 int *eof,
109 void *data)
110{
111 int len;
112
113 /* quota performance statistics */
114 len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
115 xqmstats.xs_qm_dqreclaims,
116 xqmstats.xs_qm_dqreclaim_misses,
117 xqmstats.xs_qm_dquot_dups,
118 xqmstats.xs_qm_dqcachemisses,
119 xqmstats.xs_qm_dqcachehits,
120 xqmstats.xs_qm_dqwants,
121 xqmstats.xs_qm_dqshake_reclaims,
122 xqmstats.xs_qm_dqinact_reclaims);
123
124 if (offset >= len) {
125 *start = buffer;
126 *eof = 1;
127 return 0;
128 }
129 *start = buffer + offset;
130 if ((len -= offset) > count)
131 return count;
132 *eof = 1;
133
134 return len;
135}
136
137void
138xfs_qm_init_procfs(void)
139{
140 create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
141 create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
142}
143
144void
145xfs_qm_cleanup_procfs(void)
146{
147 remove_proc_entry("fs/xfs/xqm", NULL);
148 remove_proc_entry("fs/xfs/xqmstat", NULL);
149}
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
new file mode 100644
index 000000000000..8093c5c284ec
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (c) 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QM_STATS_H__
33#define __XFS_QM_STATS_H__
34
35
36#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
37
38/*
39 * XQM global statistics
40 */
41struct xqmstats {
42 __uint32_t xs_qm_dqreclaims;
43 __uint32_t xs_qm_dqreclaim_misses;
44 __uint32_t xs_qm_dquot_dups;
45 __uint32_t xs_qm_dqcachemisses;
46 __uint32_t xs_qm_dqcachehits;
47 __uint32_t xs_qm_dqwants;
48 __uint32_t xs_qm_dqshake_reclaims;
49 __uint32_t xs_qm_dqinact_reclaims;
50};
51
52extern struct xqmstats xqmstats;
53
54# define XQM_STATS_INC(count) ( (count)++ )
55
56extern void xfs_qm_init_procfs(void);
57extern void xfs_qm_cleanup_procfs(void);
58
59#else
60
61# define XQM_STATS_INC(count) do { } while (0)
62
63static __inline void xfs_qm_init_procfs(void) { };
64static __inline void xfs_qm_cleanup_procfs(void) { };
65
66#endif
67
68#endif /* __XFS_QM_STATS_H__ */
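/*
 * Typical call site, as a sketch (the counter name comes from struct
 * xqmstats above). Because the stats-off variant expands to an empty
 * statement, callers need no #ifdef of their own:
 *
 *	XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
 */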
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
new file mode 100644
index 000000000000..229f5b5a2d25
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -0,0 +1,1458 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_dir.h"
40#include "xfs_dir2.h"
41#include "xfs_alloc.h"
42#include "xfs_dmapi.h"
43#include "xfs_quota.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_bit.h"
57#include "xfs_rtalloc.h"
58#include "xfs_error.h"
59#include "xfs_itable.h"
60#include "xfs_rw.h"
61#include "xfs_acl.h"
62#include "xfs_cap.h"
63#include "xfs_mac.h"
64#include "xfs_attr.h"
65#include "xfs_buf_item.h"
66#include "xfs_utils.h"
67
68#include "xfs_qm.h"
69
70#ifdef DEBUG
71# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
72#else
73# define qdprintk(s, args...) do { } while (0)
74#endif
75
76STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
77STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
78 fs_disk_quota_t *);
79STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
80STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
81 fs_disk_quota_t *);
82STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
83STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
84STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
85STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
86 uint);
87STATIC uint xfs_qm_import_flags(uint);
88STATIC uint xfs_qm_export_flags(uint);
89STATIC uint xfs_qm_import_qtype_flags(uint);
90STATIC uint xfs_qm_export_qtype_flags(uint);
91STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
92 fs_disk_quota_t *);
93
94
95/*
96 * The main distribution switch of all XFS quotactl system calls.
97 */
98int
99xfs_qm_quotactl(
100 struct bhv_desc *bdp,
101 int cmd,
102 int id,
103 xfs_caddr_t addr)
104{
105 xfs_mount_t *mp;
106 int error;
107 struct vfs *vfsp;
108
109 vfsp = bhvtovfs(bdp);
110 mp = XFS_VFSTOM(vfsp);
111
112 if (addr == NULL && cmd != Q_SYNC)
113 return XFS_ERROR(EINVAL);
114 if (id < 0 && cmd != Q_SYNC)
115 return XFS_ERROR(EINVAL);
116
117 /*
118 * The following commands are valid even while quota is off.
119 */
120 switch (cmd) {
121 /*
122 * Truncate quota files. Quota must be off.
123 */
124 case Q_XQUOTARM:
125 if (XFS_IS_QUOTA_ON(mp) || addr == NULL)
126 return XFS_ERROR(EINVAL);
127 if (vfsp->vfs_flag & VFS_RDONLY)
128 return XFS_ERROR(EROFS);
129 return (xfs_qm_scall_trunc_qfiles(mp,
130 xfs_qm_import_qtype_flags(*(uint *)addr)));
131 /*
132 * Get quota status information.
133 */
134 case Q_XGETQSTAT:
135 return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
136
137 /*
138 * QUOTAON for root f/s and quota enforcement on others.
139 * Quota accounting for non-root f/s's must be turned on
140 * at mount time.
141 */
142 case Q_XQUOTAON:
143 if (addr == NULL)
144 return XFS_ERROR(EINVAL);
145 if (vfsp->vfs_flag & VFS_RDONLY)
146 return XFS_ERROR(EROFS);
147 return (xfs_qm_scall_quotaon(mp,
148 xfs_qm_import_flags(*(uint *)addr)));
149 case Q_XQUOTAOFF:
150 if (vfsp->vfs_flag & VFS_RDONLY)
151 return XFS_ERROR(EROFS);
152 break;
153
154 default:
155 break;
156 }
157
158 if (! XFS_IS_QUOTA_ON(mp))
159 return XFS_ERROR(ESRCH);
160
161 switch (cmd) {
162 case Q_XQUOTAOFF:
163 if (vfsp->vfs_flag & VFS_RDONLY)
164 return XFS_ERROR(EROFS);
165 error = xfs_qm_scall_quotaoff(mp,
166 xfs_qm_import_flags(*(uint *)addr),
167 B_FALSE);
168 break;
169
170 /*
171 * Defaults to XFS_GETUQUOTA.
172 */
173 case Q_XGETQUOTA:
174 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
175 (fs_disk_quota_t *)addr);
176 break;
177 /*
178 * Set limits, both hard and soft. Defaults to Q_SETUQLIM.
179 */
180 case Q_XSETQLIM:
181 if (vfsp->vfs_flag & VFS_RDONLY)
182 return XFS_ERROR(EROFS);
183 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
184 (fs_disk_quota_t *)addr);
185 break;
186
187 case Q_XSETGQLIM:
188 if (vfsp->vfs_flag & VFS_RDONLY)
189 return XFS_ERROR(EROFS);
190 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
191 (fs_disk_quota_t *)addr);
192 break;
193
194
195 case Q_XGETGQUOTA:
196 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
197 (fs_disk_quota_t *)addr);
198 break;
199
200 /*
201 * Quotas are entirely undefined after quotaoff in XFS quotas.
202 * For instance, there's no way to set limits once quota is off.
203 */
204
205 default:
206 error = XFS_ERROR(EINVAL);
207 break;
208 }
209
210 return (error);
211}
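/*
 * For orientation, a hypothetical userspace sketch of reaching the switch
 * above through quotactl(2). The Q_XGETQSTAT constant and fs_quota_stat_t
 * layout are assumed to come from <linux/dqblk_xfs.h>; names may vary
 * with the libc and kernel headers in use.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>
#include <linux/dqblk_xfs.h>

static int print_xfs_qstat(const char *dev)
{
	fs_quota_stat_t	qs;

	/* Valid even while quota is off; see the first switch above. */
	if (quotactl(QCMD(Q_XGETQSTAT, USRQUOTA), dev, 0, (caddr_t)&qs))
		return -1;
	printf("version %d flags 0x%x uquota ino %llu\n",
	       qs.qs_version, qs.qs_flags,
	       (unsigned long long)qs.qs_uquota.qfs_ino);
	return 0;
}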
212
213/*
214 * Turn off quota accounting and/or enforcement for all udquots and/or
215 * gdquots. Called only at unmount time.
216 *
217 * This assumes that there are no dquots of this file system cached
218 * incore, and modifies the ondisk dquot directly. Therefore, for example,
219 * it is an error to call this twice without purging the cache.
220 */
221STATIC int
222xfs_qm_scall_quotaoff(
223 xfs_mount_t *mp,
224 uint flags,
225 boolean_t force)
226{
227 uint dqtype;
228 unsigned long s;
229 int error;
230 uint inactivate_flags;
231 xfs_qoff_logitem_t *qoffstart;
232 int nculprits;
233
234 if (!force && !capable(CAP_SYS_ADMIN))
235 return XFS_ERROR(EPERM);
236 /*
237 * No file system can have quotas enabled on disk but not in core.
238 * Note that quota utilities (like quotaoff) _expect_
239 * errno == EEXIST here.
240 */
241 if ((mp->m_qflags & flags) == 0)
242 return XFS_ERROR(EEXIST);
243 error = 0;
244
245 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
246
247 /*
248 * We don't want two quotaoffs messing each other up, so we
249 * serialize them. quotaoff isn't exactly a performance-critical
250 * thing.
251 * If quotaoff, then we must be dealing with the root filesystem.
252 */
253 ASSERT(mp->m_quotainfo);
254 if (mp->m_quotainfo)
255 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
256
257 ASSERT(mp->m_quotainfo);
258
259 /*
260 * If we're just turning off quota enforcement, change mp and go.
261 */
262 if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
263 mp->m_qflags &= ~(flags);
264
265 s = XFS_SB_LOCK(mp);
266 mp->m_sb.sb_qflags = mp->m_qflags;
267 XFS_SB_UNLOCK(mp, s);
268 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
269
270 /* XXX what to do on error? Revert to old vals incore? */
271 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
272 return (error);
273 }
274
275 dqtype = 0;
276 inactivate_flags = 0;
277 /*
278 * If accounting is off, we must turn enforcement off and clear the
279 * quota 'CHKD' certificate to make it known that we have to
280 * do a quotacheck the next time this quota is turned on.
281 */
282 if (flags & XFS_UQUOTA_ACCT) {
283 dqtype |= XFS_QMOPT_UQUOTA;
284 flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
285 inactivate_flags |= XFS_UQUOTA_ACTIVE;
286 }
287 if (flags & XFS_GQUOTA_ACCT) {
288 dqtype |= XFS_QMOPT_GQUOTA;
289 flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
290 inactivate_flags |= XFS_GQUOTA_ACTIVE;
291 }
292
293 /*
294 * Nothing to do? Don't complain. This happens when we're just
295 * turning off quota enforcement.
296 */
297 if ((mp->m_qflags & flags) == 0) {
298 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
299 return (0);
300 }
301
302 /*
303 * Write the LI_QUOTAOFF log record, and make the SB changes
304 * atomically and synchronously.
305 */
306 xfs_qm_log_quotaoff(mp, &qoffstart, flags);
307
308 /*
309 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
310 * to take care of the race between dqget and quotaoff. We don't take
311 * any special locks to reset these bits. All processes need to check
312 * these bits *after* taking inode lock(s) to see if the particular
313 * quota type is in the process of being turned off. If *ACTIVE, it is
314 * guaranteed that all dquot structures and quotainode ptrs will
315 * stay valid as long as that inode is kept locked.
316 *
317 * There is no turning back after this.
318 */
319 mp->m_qflags &= ~inactivate_flags;
320
321 /*
322 * Give back all the dquot reference(s) held by inodes.
323 * Here we go thru every single incore inode in this file system, and
324 * do a dqrele on the i_udquot/i_gdquot that it may have.
325 * Essentially, as long as somebody has an inode locked, this guarantees
326 * that quotas will not be turned off. This is handy because in a
327 * transaction once we lock the inode(s) and check for quotaon, we can
328 * depend on the quota inodes (and other things) being valid as long as
329 * we keep the lock(s).
330 */
331 xfs_qm_dqrele_all_inodes(mp, flags);
332
333 /*
334 * Next we make the changes in the quota flag in the mount struct.
335 * This isn't protected by a particular lock directly, because we
336 * don't want to take an mrlock every time we depend on quotas being on.
337 */
338 mp->m_qflags &= ~(flags);
339
340 /*
341 * Go through all the dquots of this file system and purge them,
342 * according to what was turned off. We may not be able to get rid
343 * of all dquots, because dquots can have temporary references that
344 * are not attached to inodes, e.g. xfs_setattr, xfs_create.
345 * So, if we couldn't purge all the dquots from the filesystem,
346 * we can't get rid of the incore data structures.
347 */
348 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF)))
349 delay(10 * nculprits);
350
351 /*
352 * Transactions that had started before the ACTIVE state bit was cleared
353 * could have logged many dquots, so they'd have higher LSNs than
354 * the first QUOTAOFF log record does. If we happen to crash when
355 * the tail of the log has gone past the QUOTAOFF record, but
356 * before the last dquot modification, those dquots __will__
357 * recover, and that's not good.
358 *
359 * So, we have QUOTAOFF start and end logitems; the start
360 * logitem won't get overwritten until the end logitem appears...
361 */
362 xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
363
364 /*
365 * If quotas is completely disabled, close shop.
366 */
367 if ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_ALL) {
368 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
369 xfs_qm_destroy_quotainfo(mp);
370 return (0);
371 }
372
373 /*
374 * Release our quotainode references, and vn_purge them,
375 * if we don't need them anymore.
376 */
377 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
378 XFS_PURGE_INODE(XFS_QI_UQIP(mp));
379 XFS_QI_UQIP(mp) = NULL;
380 }
381 if ((dqtype & XFS_QMOPT_GQUOTA) && XFS_QI_GQIP(mp)) {
382 XFS_PURGE_INODE(XFS_QI_GQIP(mp));
383 XFS_QI_GQIP(mp) = NULL;
384 }
385 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
386
387 return (error);
388}
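/*
 * To recap the ordering that xfs_qm_scall_quotaoff() above depends on:
 *
 *  1. log the QUOTAOFF start item and the sb_qflags change, synchronously;
 *  2. clear the *ACTIVE bits, closing the race against dqget;
 *  3. dqrele the dquots held by every incore inode;
 *  4. clear the accounting/enforcement bits in mp->m_qflags;
 *  5. purge incore dquots, retrying while temporary references remain;
 *  6. log the QUOTAOFF end item, which keeps the start item live in the
 *     log until all dquot modifications are safely behind it.
 */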
389
390STATIC int
391xfs_qm_scall_trunc_qfiles(
392 xfs_mount_t *mp,
393 uint flags)
394{
395 int error;
396 xfs_inode_t *qip;
397
398 if (!capable(CAP_SYS_ADMIN))
399 return XFS_ERROR(EPERM);
400 error = 0;
401 if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) {
402 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
403 return XFS_ERROR(EINVAL);
404 }
405
406 if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
407 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
408 if (! error) {
409 (void) xfs_truncate_file(mp, qip);
410 VN_RELE(XFS_ITOV(qip));
411 }
412 }
413
414 if ((flags & XFS_DQ_GROUP) && mp->m_sb.sb_gquotino != NULLFSINO) {
415 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
416 if (! error) {
417 (void) xfs_truncate_file(mp, qip);
418 VN_RELE(XFS_ITOV(qip));
419 }
420 }
421
422 return (error);
423}
424
425
426/*
427 * Switch on (a given) quota enforcement for a filesystem. This takes
428 * effect immediately.
429 * (Switching on quota accounting must be done at mount time.)
430 */
431STATIC int
432xfs_qm_scall_quotaon(
433 xfs_mount_t *mp,
434 uint flags)
435{
436 int error;
437 unsigned long s;
438 uint qf;
439 uint accflags;
440 __int64_t sbflags;
441
442 if (!capable(CAP_SYS_ADMIN))
443 return XFS_ERROR(EPERM);
444
445 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
446 /*
447 * Switching on quota accounting must be done at mount time.
448 */
449 accflags = flags & XFS_ALL_QUOTA_ACCT;
450 flags &= ~(XFS_ALL_QUOTA_ACCT);
451
452 sbflags = 0;
453
454 if (flags == 0) {
455 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags);
456 return XFS_ERROR(EINVAL);
457 }
458
459 /* No fs can turn on quotas with a delayed effect */
460 ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
461
462 /*
463 * Can't enforce without accounting. We check the superblock
464 * qflags here instead of m_qflags because rootfs can have
465 * quota accounting on disk without m_qflags knowing about it.
466 */
467 if (((flags & XFS_UQUOTA_ACCT) == 0 &&
468 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
469 (flags & XFS_UQUOTA_ENFD))
470 ||
471 ((flags & XFS_GQUOTA_ACCT) == 0 &&
472 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
473 (flags & XFS_GQUOTA_ENFD))) {
474 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
475 flags, mp->m_sb.sb_qflags);
476 return XFS_ERROR(EINVAL);
477 }
478 /*
479 * If everything's up-to-date incore, then don't waste time.
480 */
481 if ((mp->m_qflags & flags) == flags)
482 return XFS_ERROR(EEXIST);
483
484 /*
485 * Change sb_qflags on disk but not incore mp->m_qflags
486 * if this is the root filesystem.
487 */
488 s = XFS_SB_LOCK(mp);
489 qf = mp->m_sb.sb_qflags;
490 mp->m_sb.sb_qflags = qf | flags;
491 XFS_SB_UNLOCK(mp, s);
492
493 /*
494 * There's nothing to change if it's the same.
495 */
496 if ((qf & flags) == flags && sbflags == 0)
497 return XFS_ERROR(EEXIST);
498 sbflags |= XFS_SB_QFLAGS;
499
500 if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
501 return (error);
502 /*
503 * If we aren't trying to switch on quota enforcement, we are done.
504 */
505 if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
506 (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
507 (flags & XFS_ALL_QUOTA_ENFD) == 0)
508 return (0);
509
510 if (! XFS_IS_QUOTA_RUNNING(mp))
511 return XFS_ERROR(ESRCH);
512
513 /*
514 * Switch on quota enforcement in core.
515 */
516 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
517 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
518 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
519
520 return (0);
521}
522
523
524
525/*
526 * Return quota status information, such as uquota-off, enforcements, etc.
527 */
528STATIC int
529xfs_qm_scall_getqstat(
530 xfs_mount_t *mp,
531 fs_quota_stat_t *out)
532{
533 xfs_inode_t *uip, *gip;
534 boolean_t tempuqip, tempgqip;
535
536 uip = gip = NULL;
537 tempuqip = tempgqip = B_FALSE;
538 memset(out, 0, sizeof(fs_quota_stat_t));
539
540 out->qs_version = FS_QSTAT_VERSION;
541 if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
542 out->qs_uquota.qfs_ino = NULLFSINO;
543 out->qs_gquota.qfs_ino = NULLFSINO;
544 return (0);
545 }
546 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
547 (XFS_ALL_QUOTA_ACCT|
548 XFS_ALL_QUOTA_ENFD));
549 out->qs_pad = 0;
550 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
551 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
552
553 if (mp->m_quotainfo) {
554 uip = mp->m_quotainfo->qi_uquotaip;
555 gip = mp->m_quotainfo->qi_gquotaip;
556 }
557 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
558 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
559 0, 0, &uip, 0) == 0)
560 tempuqip = B_TRUE;
561 }
562 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
563 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
564 0, 0, &gip, 0) == 0)
565 tempgqip = B_TRUE;
566 }
567 if (uip) {
568 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
569 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
570 if (tempuqip)
571 VN_RELE(XFS_ITOV(uip));
572 }
573 if (gip) {
574 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
575 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
576 if (tempgqip)
577 VN_RELE(XFS_ITOV(gip));
578 }
579 if (mp->m_quotainfo) {
580 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
581 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
582 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
583 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
584 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
585 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
586 }
587 return (0);
588}
589
590/*
591 * Adjust quota limits, and start/stop timers accordingly.
592 */
593STATIC int
594xfs_qm_scall_setqlim(
595 xfs_mount_t *mp,
596 xfs_dqid_t id,
597 uint type,
598 fs_disk_quota_t *newlim)
599{
600 xfs_disk_dquot_t *ddq;
601 xfs_dquot_t *dqp;
602 xfs_trans_t *tp;
603 int error;
604 xfs_qcnt_t hard, soft;
605
606 if (!capable(CAP_SYS_ADMIN))
607 return XFS_ERROR(EPERM);
608
609 if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK)) == 0)
610 return (0);
611
612 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
613 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
614 0, 0, XFS_DEFAULT_LOG_COUNT))) {
615 xfs_trans_cancel(tp, 0);
616 return (error);
617 }
618
619 /*
620 * We don't want to race with a quotaoff so take the quotaoff lock.
621 * (We don't hold an inode lock, so there's nothing else to stop
622 * a quotaoff from happening). (XXX This doesn't currently happen
623 * because we take the vfslock before calling xfs_qm_sysent).
624 */
625 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
626
627 /*
628 * Get the dquot (locked), and join it to the transaction.
629 * Allocate the dquot if this doesn't exist.
630 */
631 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
632 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
633 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
634 ASSERT(error != ENOENT);
635 return (error);
636 }
637 xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
638 xfs_trans_dqjoin(tp, dqp);
639 ddq = &dqp->q_core;
640
641 /*
642 * Make sure that hard limits are >= soft limits before changing.
643 */
644 hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
645 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
646 INT_GET(ddq->d_blk_hardlimit, ARCH_CONVERT);
647 soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
648 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
649 INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT);
650 if (hard == 0 || hard >= soft) {
651 INT_SET(ddq->d_blk_hardlimit, ARCH_CONVERT, hard);
652 INT_SET(ddq->d_blk_softlimit, ARCH_CONVERT, soft);
653 if (id == 0) {
654 mp->m_quotainfo->qi_bhardlimit = hard;
655 mp->m_quotainfo->qi_bsoftlimit = soft;
656 }
657 } else {
658 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
659 }
660 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
661 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
662 INT_GET(ddq->d_rtb_hardlimit, ARCH_CONVERT);
663 soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
664 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
665 INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT);
666 if (hard == 0 || hard >= soft) {
667 INT_SET(ddq->d_rtb_hardlimit, ARCH_CONVERT, hard);
668 INT_SET(ddq->d_rtb_softlimit, ARCH_CONVERT, soft);
669 if (id == 0) {
670 mp->m_quotainfo->qi_rtbhardlimit = hard;
671 mp->m_quotainfo->qi_rtbsoftlimit = soft;
672 }
673 } else {
674 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
675 }
676
677 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
678 (xfs_qcnt_t) newlim->d_ino_hardlimit :
679 INT_GET(ddq->d_ino_hardlimit, ARCH_CONVERT);
680 soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
681 (xfs_qcnt_t) newlim->d_ino_softlimit :
682 INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT);
683 if (hard == 0 || hard >= soft) {
684 INT_SET(ddq->d_ino_hardlimit, ARCH_CONVERT, hard);
685 INT_SET(ddq->d_ino_softlimit, ARCH_CONVERT, soft);
686 if (id == 0) {
687 mp->m_quotainfo->qi_ihardlimit = hard;
688 mp->m_quotainfo->qi_isoftlimit = soft;
689 }
690 } else {
691 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
692 }
693
694 if (id == 0) {
695 /*
696 * Timelimits for the super user set the relative time
697 * the other users can be over quota for this file system.
698 * If it is zero, a default is used. Ditto for the default
699 * soft and hard limit values (already done, above).
700 */
701 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
702 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
703 INT_SET(ddq->d_btimer, ARCH_CONVERT, newlim->d_btimer);
704 }
705 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
706 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
707 INT_SET(ddq->d_itimer, ARCH_CONVERT, newlim->d_itimer);
708 }
709 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
710 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
711 INT_SET(ddq->d_rtbtimer, ARCH_CONVERT, newlim->d_rtbtimer);
712 }
713 } else /* if (XFS_IS_QUOTA_ENFORCED(mp)) */ {
714 /*
715 * If the user is now over quota, start the timelimit.
716 * The user will not be 'warned'.
717 * Note that we keep the timers ticking, whether enforcement
718 * is on or off. We don't really want to bother with iterating
719 * over all ondisk dquots and turning the timers on/off.
720 */
721 xfs_qm_adjust_dqtimers(mp, ddq);
722 }
723 dqp->dq_flags |= XFS_DQ_DIRTY;
724 xfs_trans_log_dquot(tp, dqp);
725
726 xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
727 xfs_trans_commit(tp, 0, NULL);
728 xfs_qm_dqprint(dqp);
729 xfs_qm_dqrele(dqp);
730 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
731
732 return (0);
733}
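/*
 * Note on the validation above: a new limit pair is applied only when
 * hard == 0 (no hard limit) or hard >= soft. For example, a request with
 * d_fieldmask = FS_DQ_BHARD | FS_DQ_BSOFT, hard = 100 and soft = 200 is
 * rejected: the on-disk values are left untouched and only a qdprintk
 * diagnostic is emitted.
 */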
734
735STATIC int
736xfs_qm_scall_getquota(
737 xfs_mount_t *mp,
738 xfs_dqid_t id,
739 uint type,
740 fs_disk_quota_t *out)
741{
742 xfs_dquot_t *dqp;
743 int error;
744
745 /*
746 * Try to get the dquot. We don't want it allocated on disk, so
747 * we aren't passing the XFS_QMOPT_DQALLOC flag. If it doesn't
748 * exist, we'll get ENOENT back.
749 */
750 if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
751 return (error);
752 }
753
754 xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
755 /*
756 * If everything's NULL, this dquot doesn't quite exist as far as
757 * our utility programs are concerned.
758 */
759 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
760 xfs_qm_dqput(dqp);
761 return XFS_ERROR(ENOENT);
762 }
763 /* xfs_qm_dqprint(dqp); */
764 /*
765 * Convert the disk dquot to the exportable format
766 */
767 xfs_qm_export_dquot(mp, &dqp->q_core, out);
768 xfs_qm_dqput(dqp);
769 return (error ? XFS_ERROR(EFAULT) : 0);
770}
771
772
773STATIC int
774xfs_qm_log_quotaoff_end(
775 xfs_mount_t *mp,
776 xfs_qoff_logitem_t *startqoff,
777 uint flags)
778{
779 xfs_trans_t *tp;
780 int error;
781 xfs_qoff_logitem_t *qoffi;
782
783 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
784
785 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
786 0, 0, XFS_DEFAULT_LOG_COUNT))) {
787 xfs_trans_cancel(tp, 0);
788 return (error);
789 }
790
791 qoffi = xfs_trans_get_qoff_item(tp, startqoff,
792 flags & XFS_ALL_QUOTA_ACCT);
793 xfs_trans_log_quotaoff_item(tp, qoffi);
794
795 /*
796 * We have to make sure that the transaction is secure on disk before we
797 * return and actually stop quota accounting. So, make it synchronous.
798 * We don't care about quotoff's performance.
799 * We don't care about quotaoff's performance.
800 xfs_trans_set_sync(tp);
801 error = xfs_trans_commit(tp, 0, NULL);
802 return (error);
803}
804
805
806STATIC int
807xfs_qm_log_quotaoff(
808 xfs_mount_t *mp,
809 xfs_qoff_logitem_t **qoffstartp,
810 uint flags)
811{
812 xfs_trans_t *tp;
813 int error;
814 unsigned long s;
815 xfs_qoff_logitem_t *qoffi=NULL;
816 uint oldsbqflag=0;
817
818 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
819 if ((error = xfs_trans_reserve(tp, 0,
820 sizeof(xfs_qoff_logitem_t) * 2 +
821 mp->m_sb.sb_sectsize + 128,
822 0,
823 0,
824 XFS_DEFAULT_LOG_COUNT))) {
825 goto error0;
826 }
827
828 qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
829 xfs_trans_log_quotaoff_item(tp, qoffi);
830
831 s = XFS_SB_LOCK(mp);
832 oldsbqflag = mp->m_sb.sb_qflags;
833 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
834 XFS_SB_UNLOCK(mp, s);
835
836 xfs_mod_sb(tp, XFS_SB_QFLAGS);
837
838 /*
839 * We have to make sure that the transaction is secure on disk before we
840 * return and actually stop quota accounting. So, make it synchronous.
841 * We don't care about quotaoff's performance.
842 */
843 xfs_trans_set_sync(tp);
844 error = xfs_trans_commit(tp, 0, NULL);
845
846error0:
847 if (error) {
848 xfs_trans_cancel(tp, 0);
849 /*
850 * No one else is modifying sb_qflags, so this is OK.
851 * We still hold the quotaofflock.
852 */
853 s = XFS_SB_LOCK(mp);
854 mp->m_sb.sb_qflags = oldsbqflag;
855 XFS_SB_UNLOCK(mp, s);
856 }
857 *qoffstartp = qoffi;
858 return (error);
859}
860
861
862/*
863 * Translate an internal style on-disk-dquot to the exportable format.
864 * The main differences are that the counters/limits are all in Basic
865 * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
866 * to be converted to the native endianness.
867 */
868STATIC void
869xfs_qm_export_dquot(
870 xfs_mount_t *mp,
871 xfs_disk_dquot_t *src,
872 struct fs_disk_quota *dst)
873{
874 memset(dst, 0, sizeof(*dst));
875 dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */
876 dst->d_flags =
877 xfs_qm_export_qtype_flags(INT_GET(src->d_flags, ARCH_CONVERT));
878 dst->d_id = INT_GET(src->d_id, ARCH_CONVERT);
879 dst->d_blk_hardlimit = (__uint64_t)
880 XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_hardlimit, ARCH_CONVERT));
881 dst->d_blk_softlimit = (__uint64_t)
882 XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_softlimit, ARCH_CONVERT));
883 dst->d_ino_hardlimit = (__uint64_t)
884 INT_GET(src->d_ino_hardlimit, ARCH_CONVERT);
885 dst->d_ino_softlimit = (__uint64_t)
886 INT_GET(src->d_ino_softlimit, ARCH_CONVERT);
887 dst->d_bcount = (__uint64_t)
888 XFS_FSB_TO_BB(mp, INT_GET(src->d_bcount, ARCH_CONVERT));
889 dst->d_icount = (__uint64_t) INT_GET(src->d_icount, ARCH_CONVERT);
890 dst->d_btimer = (__uint32_t) INT_GET(src->d_btimer, ARCH_CONVERT);
891 dst->d_itimer = (__uint32_t) INT_GET(src->d_itimer, ARCH_CONVERT);
892 dst->d_iwarns = INT_GET(src->d_iwarns, ARCH_CONVERT);
893 dst->d_bwarns = INT_GET(src->d_bwarns, ARCH_CONVERT);
894
895 dst->d_rtb_hardlimit = (__uint64_t)
896 XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_hardlimit, ARCH_CONVERT));
897 dst->d_rtb_softlimit = (__uint64_t)
898 XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_softlimit, ARCH_CONVERT));
899 dst->d_rtbcount = (__uint64_t)
900 XFS_FSB_TO_BB(mp, INT_GET(src->d_rtbcount, ARCH_CONVERT));
901 dst->d_rtbtimer = (__uint32_t) INT_GET(src->d_rtbtimer, ARCH_CONVERT);
902 dst->d_rtbwarns = INT_GET(src->d_rtbwarns, ARCH_CONVERT);
903
904 /*
905 * Internally, we don't reset all the timers when quota enforcement
906 * gets turned off. No need to confuse the userlevel code,
907 * so return zeroes in that case.
908 */
909 if (! XFS_IS_QUOTA_ENFORCED(mp)) {
910 dst->d_btimer = 0;
911 dst->d_itimer = 0;
912 dst->d_rtbtimer = 0;
913 }
914
915#ifdef DEBUG
916 if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) {
917 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
918 (dst->d_blk_softlimit > 0)) {
919 ASSERT(dst->d_btimer != 0);
920 }
921 if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
922 (dst->d_ino_softlimit > 0)) {
923 ASSERT(dst->d_itimer != 0);
924 }
925 }
926#endif
927}
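/*
 * Unit note for the conversions above: fs_disk_quota counts space in
 * 512-byte basic blocks, while the on-disk dquot counts filesystem
 * blocks, so XFS_FSB_TO_BB(mp, n) is in effect n << (sb_blocklog - 9);
 * with 4k filesystem blocks, one FSB exports as 8 BBs.
 */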
928
929STATIC uint
930xfs_qm_import_qtype_flags(
931 uint uflags)
932{
933 /*
934 * Can't be both at the same time.
935 */
936 if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
937 (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
938 ((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == 0))
939 return (0);
940
941 return (uflags & XFS_USER_QUOTA) ?
942 XFS_DQ_USER : XFS_DQ_GROUP;
943}
944
945STATIC uint
946xfs_qm_export_qtype_flags(
947 uint flags)
948{
949 /*
950 * Can't be both at the same time.
951 */
952 ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) !=
953 (XFS_GROUP_QUOTA | XFS_USER_QUOTA));
954 ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != 0);
955
956 return (flags & XFS_DQ_USER) ?
957 XFS_USER_QUOTA : XFS_GROUP_QUOTA;
958}
959
960STATIC uint
961xfs_qm_import_flags(
962 uint uflags)
963{
964 uint flags = 0;
965
966 if (uflags & XFS_QUOTA_UDQ_ACCT)
967 flags |= XFS_UQUOTA_ACCT;
968 if (uflags & XFS_QUOTA_GDQ_ACCT)
969 flags |= XFS_GQUOTA_ACCT;
970 if (uflags & XFS_QUOTA_UDQ_ENFD)
971 flags |= XFS_UQUOTA_ENFD;
972 if (uflags & XFS_QUOTA_GDQ_ENFD)
973 flags |= XFS_GQUOTA_ENFD;
974 return (flags);
975}
976
977
978STATIC uint
979xfs_qm_export_flags(
980 uint flags)
981{
982 uint uflags;
983
984 uflags = 0;
985 if (flags & XFS_UQUOTA_ACCT)
986 uflags |= XFS_QUOTA_UDQ_ACCT;
987 if (flags & XFS_GQUOTA_ACCT)
988 uflags |= XFS_QUOTA_GDQ_ACCT;
989 if (flags & XFS_UQUOTA_ENFD)
990 uflags |= XFS_QUOTA_UDQ_ENFD;
991 if (flags & XFS_GQUOTA_ENFD)
992 uflags |= XFS_QUOTA_GDQ_ENFD;
993 return (uflags);
994}
995
996
997/*
998 * Go thru all the inodes in the file system, releasing their dquots.
999 * Note that the mount structure gets modified to indicate that quotas are off
1000 * AFTER this, in the case of quotaoff. This also gets called from
1001 * xfs_rootumount.
1002 */
1003void
1004xfs_qm_dqrele_all_inodes(
1005 struct xfs_mount *mp,
1006 uint flags)
1007{
1008 vmap_t vmap;
1009 xfs_inode_t *ip, *topino;
1010 uint ireclaims;
1011 vnode_t *vp;
1012 boolean_t vnode_refd;
1013
1014 ASSERT(mp->m_quotainfo);
1015
1016again:
1017 XFS_MOUNT_ILOCK(mp);
1018 ip = mp->m_inodes;
1019 if (ip == NULL) {
1020 XFS_MOUNT_IUNLOCK(mp);
1021 return;
1022 }
1023 do {
1024 /* Skip markers inserted by xfs_sync */
1025 if (ip->i_mount == NULL) {
1026 ip = ip->i_mnext;
1027 continue;
1028 }
1029 /* The quota inodes themselves never carry dquot references */
1030 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
1031 ASSERT(ip->i_udquot == NULL);
1032 ASSERT(ip->i_gdquot == NULL);
1033 ip = ip->i_mnext;
1034 continue;
1035 }
1036 vp = XFS_ITOV_NULL(ip);
1037 if (!vp) {
1038 ASSERT(ip->i_udquot == NULL);
1039 ASSERT(ip->i_gdquot == NULL);
1040 ip = ip->i_mnext;
1041 continue;
1042 }
1043 vnode_refd = B_FALSE;
1044 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
1045 /*
1046 * Sample vp mapping while holding the mplock, lest
1047 * we come across a non-existent vnode.
1048 */
1049 VMAP(vp, vmap);
1050 ireclaims = mp->m_ireclaims;
1051 topino = mp->m_inodes;
1052 XFS_MOUNT_IUNLOCK(mp);
1053
1054 /* XXX restart limit ? */
1055 if ( ! (vp = vn_get(vp, &vmap)))
1056 goto again;
1057 xfs_ilock(ip, XFS_ILOCK_EXCL);
1058 vnode_refd = B_TRUE;
1059 } else {
1060 ireclaims = mp->m_ireclaims;
1061 topino = mp->m_inodes;
1062 XFS_MOUNT_IUNLOCK(mp);
1063 }
1064
1065 /*
1066 * We don't keep the mountlock across the dqrele() call,
1067 * since it can take a while.
1068 */
1069 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
1070 xfs_qm_dqrele(ip->i_udquot);
1071 ip->i_udquot = NULL;
1072 }
1073 if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
1074 xfs_qm_dqrele(ip->i_gdquot);
1075 ip->i_gdquot = NULL;
1076 }
1077 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1078 /*
1079 * Wait until we've dropped the ilock and mountlock to
1080 * do the vn_rele. Or be condemned to an eternity in the
1081 * inactive code in hell.
1082 */
1083 if (vnode_refd)
1084 VN_RELE(vp);
1085 XFS_MOUNT_ILOCK(mp);
1086 /*
1087 * If an inode was inserted or removed, we gotta
1088 * start over again.
1089 */
1090 if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
1091 /* XXX use a sentinel */
1092 XFS_MOUNT_IUNLOCK(mp);
1093 goto again;
1094 }
1095 ip = ip->i_mnext;
1096 } while (ip != mp->m_inodes);
1097
1098 XFS_MOUNT_IUNLOCK(mp);
1099}
1100
1101/*------------------------------------------------------------------------*/
1102#ifdef DEBUG
1103/*
1104 * This contains all the test functions for XFS disk quotas.
1105 * Currently it does a quota accounting check, i.e. it walks through
1106 * all inodes in the file system, calculates the dquot accounting fields,
1107 * and prints out any inconsistencies.
1108 */
1109xfs_dqhash_t *qmtest_udqtab;
1110xfs_dqhash_t *qmtest_gdqtab;
1111int qmtest_hashmask;
1112int qmtest_nfails;
1113mutex_t qcheck_lock;
1114
1115#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
1116 (__psunsigned_t)(id)) & \
1117 (qmtest_hashmask - 1))
1118
1119#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \
1120 (qmtest_udqtab + \
1121 DQTEST_HASHVAL(mp, id)) : \
1122 (qmtest_gdqtab + \
1123 DQTEST_HASHVAL(mp, id)))
1124
1125#define DQTEST_LIST_PRINT(l, NXT, title) \
1126{ \
1127 xfs_dqtest_t *dqp; int i = 0;\
1128 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
1129 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
1130 dqp = (xfs_dqtest_t *)dqp->NXT) { \
1131 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
1132 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
1133 dqp->d_bcount, dqp->d_icount); } \
1134}
1135
1136typedef struct dqtest {
1137 xfs_dqmarker_t q_lists;
1138 xfs_dqhash_t *q_hash; /* the hashchain header */
1139 xfs_mount_t *q_mount; /* filesystem this relates to */
1140 xfs_dqid_t d_id; /* user id or group id */
1141 xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */
1142 xfs_qcnt_t d_icount; /* # inodes owned by the user */
1143} xfs_dqtest_t;
1144
1145STATIC void
1146xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
1147{
1148 xfs_dquot_t *d;
1149 if (((d) = (h)->qh_next))
1150 (d)->HL_PREVP = &((dqp)->HL_NEXT);
1151 (dqp)->HL_NEXT = d;
1152 (dqp)->HL_PREVP = &((h)->qh_next);
1153 (h)->qh_next = (xfs_dquot_t *)dqp;
1154 (h)->qh_version++;
1155 (h)->qh_nelems++;
1156}
1157STATIC void
1158xfs_qm_dqtest_print(
1159 xfs_dqtest_t *d)
1160{
1161 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------");
1162 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id);
1163 cmn_err(CE_DEBUG, "---- type = %s", XFS_QM_ISUDQ(d)? "USR" : "GRP");
1164 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount);
1165 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)",
1166 d->d_bcount, (int)d->d_bcount);
1167 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)",
1168 d->d_icount, (int)d->d_icount);
1169 cmn_err(CE_DEBUG, "---------------------------");
1170}
1171
1172STATIC void
1173xfs_qm_dqtest_failed(
1174 xfs_dqtest_t *d,
1175 xfs_dquot_t *dqp,
1176 char *reason,
1177 xfs_qcnt_t a,
1178 xfs_qcnt_t b,
1179 int error)
1180{
1181 qmtest_nfails++;
1182 if (error)
1183 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s",
1184 INT_GET(d->d_id, ARCH_CONVERT), error, reason);
1185 else
1186 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]",
1187 INT_GET(d->d_id, ARCH_CONVERT), reason, (int)a, (int)b);
1188 xfs_qm_dqtest_print(d);
1189 if (dqp)
1190 xfs_qm_dqprint(dqp);
1191}
1192
1193STATIC int
1194xfs_dqtest_cmp2(
1195 xfs_dqtest_t *d,
1196 xfs_dquot_t *dqp)
1197{
1198 int err = 0;
1199 if (INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) != d->d_icount) {
1200 xfs_qm_dqtest_failed(d, dqp, "icount mismatch",
1201 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
1202 d->d_icount, 0);
1203 err++;
1204 }
1205 if (INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) != d->d_bcount) {
1206 xfs_qm_dqtest_failed(d, dqp, "bcount mismatch",
1207 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
1208 d->d_bcount, 0);
1209 err++;
1210 }
1211 if (INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT) &&
1212 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) >=
1213 INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT)) {
1214 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1215 cmn_err(CE_DEBUG,
1216 "%d [%s] [0x%p] BLK TIMER NOT STARTED",
1217 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
1218 err++;
1219 }
1220 }
1221 if (INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) &&
1222 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >=
1223 INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)) {
1224 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1225 cmn_err(CE_DEBUG,
1226 "%d [%s] [0x%p] INO TIMER NOT STARTED",
1227 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
1228 err++;
1229 }
1230 }
1231#ifdef QUOTADEBUG
1232 if (!err) {
1233 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked",
1234 d->d_id, XFS_QM_ISUDQ(d) ? "USR" : "GRP", d->q_mount);
1235 }
1236#endif
1237 return (err);
1238}
1239
1240STATIC void
1241xfs_dqtest_cmp(
1242 xfs_dqtest_t *d)
1243{
1244 xfs_dquot_t *dqp;
1245 int error;
1246
1247 /* xfs_qm_dqtest_print(d); */
1248 if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0,
1249 &dqp))) {
1250 xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error);
1251 return;
1252 }
1253 xfs_dqtest_cmp2(d, dqp);
1254 xfs_qm_dqput(dqp);
1255}
1256
1257STATIC int
1258xfs_qm_internalqcheck_dqget(
1259 xfs_mount_t *mp,
1260 xfs_dqid_t id,
1261 uint type,
1262 xfs_dqtest_t **O_dq)
1263{
1264 xfs_dqtest_t *d;
1265 xfs_dqhash_t *h;
1266
1267 h = DQTEST_HASH(mp, id, type);
1268 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
1269 d = (xfs_dqtest_t *) d->HL_NEXT) {
1270 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1271 if (d->d_id == id && mp == d->q_mount) {
1272 *O_dq = d;
1273 return (0);
1274 }
1275 }
1276 d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP);
1277 d->dq_flags = type;
1278 d->d_id = id;
1279 d->q_mount = mp;
1280 d->q_hash = h;
1281 xfs_qm_hashinsert(h, d);
1282 *O_dq = d;
1283 return (0);
1284}
1285
1286STATIC void
1287xfs_qm_internalqcheck_get_dquots(
1288 xfs_mount_t *mp,
1289 xfs_dqid_t uid,
1290 xfs_dqid_t gid,
1291 xfs_dqtest_t **ud,
1292 xfs_dqtest_t **gd)
1293{
1294 if (XFS_IS_UQUOTA_ON(mp))
1295 xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud);
1296 if (XFS_IS_GQUOTA_ON(mp))
1297 xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd);
1298}
1299
1300
1301STATIC void
1302xfs_qm_internalqcheck_dqadjust(
1303 xfs_inode_t *ip,
1304 xfs_dqtest_t *d)
1305{
1306 d->d_icount++;
1307 d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks;
1308}
1309
1310STATIC int
1311xfs_qm_internalqcheck_adjust(
1312 xfs_mount_t *mp, /* mount point for filesystem */
1313 xfs_ino_t ino, /* inode number to get data for */
1314 void __user *buffer, /* not used */
1315 int ubsize, /* not used */
1316 void *private_data, /* not used */
1317 xfs_daddr_t bno, /* starting block of inode cluster */
1318 int *ubused, /* not used */
1319 void *dip, /* not used */
1320 int *res) /* bulkstat result code */
1321{
1322 xfs_inode_t *ip;
1323 xfs_dqtest_t *ud, *gd;
1324 uint lock_flags;
1325 boolean_t ipreleased;
1326 int error;
1327
1328 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1329
1330 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1331 *res = BULKSTAT_RV_NOTHING;
1332 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n",
1333 (unsigned long long) ino,
1334 (unsigned long long) mp->m_sb.sb_uquotino,
1335 (unsigned long long) mp->m_sb.sb_gquotino);
1336 return XFS_ERROR(EINVAL);
1337 }
1338 ipreleased = B_FALSE;
1339 again:
1340 lock_flags = XFS_ILOCK_SHARED;
1341 if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) {
1342 *res = BULKSTAT_RV_NOTHING;
1343 return (error);
1344 }
1345
1346 if (ip->i_d.di_mode == 0) {
1347 xfs_iput_new(ip, lock_flags);
1348 *res = BULKSTAT_RV_NOTHING;
1349 return XFS_ERROR(ENOENT);
1350 }
1351
1352 /*
1353 * This inode can have blocks after eof which can get released
1354 * when we send it to inactive. Since we don't check the dquot
1355 * until after all our calculations are done, we must get rid
1356 * of those now.
1357 */
1358 if (! ipreleased) {
1359 xfs_iput(ip, lock_flags);
1360 ipreleased = B_TRUE;
1361 goto again;
1362 }
1363 xfs_qm_internalqcheck_get_dquots(mp,
1364 (xfs_dqid_t) ip->i_d.di_uid,
1365 (xfs_dqid_t) ip->i_d.di_gid,
1366 &ud, &gd);
1367 if (XFS_IS_UQUOTA_ON(mp)) {
1368 ASSERT(ud);
1369 xfs_qm_internalqcheck_dqadjust(ip, ud);
1370 }
1371 if (XFS_IS_GQUOTA_ON(mp)) {
1372 ASSERT(gd);
1373 xfs_qm_internalqcheck_dqadjust(ip, gd);
1374 }
1375 xfs_iput(ip, lock_flags);
1376 *res = BULKSTAT_RV_DIDONE;
1377 return (0);
1378}
1379
1380
1381/* PRIVATE, debugging */
1382int
1383xfs_qm_internalqcheck(
1384 xfs_mount_t *mp)
1385{
1386 xfs_ino_t lastino;
1387 int done, count;
1388 int i;
1389 xfs_dqtest_t *d, *e;
1390 xfs_dqhash_t *h1;
1391 int error;
1392
1393 lastino = 0;
1394 qmtest_hashmask = 32;
1395 count = 5;
1396 done = 0;
1397 qmtest_nfails = 0;
1398
1399 if (! XFS_IS_QUOTA_ON(mp))
1400 return XFS_ERROR(ESRCH);
1401
1402 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1403 XFS_bflush(mp->m_ddev_targp);
1404 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1405 XFS_bflush(mp->m_ddev_targp);
1406
1407 mutex_lock(&qcheck_lock, PINOD);
1408 /* There should be absolutely no quota activity while this
1409  * is going on. */
1410 qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
1411 sizeof(xfs_dqhash_t), KM_SLEEP);
1412 qmtest_gdqtab = kmem_zalloc(qmtest_hashmask *
1413 sizeof(xfs_dqhash_t), KM_SLEEP);
1414 do {
1415 /*
1416 * Iterate thru all the inodes in the file system,
1417 * adjusting the corresponding dquot counters
1418 */
1419 if ((error = xfs_bulkstat(mp, &lastino, &count,
1420 xfs_qm_internalqcheck_adjust, NULL,
1421 0, NULL, BULKSTAT_FG_IGET, &done))) {
1422 break;
1423 }
1424 } while (! done);
1425 if (error) {
1426 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
1427 }
1428 cmn_err(CE_DEBUG, "Checking results against system dquots");
1429 for (i = 0; i < qmtest_hashmask; i++) {
1430 h1 = &qmtest_udqtab[i];
1431 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
1432 xfs_dqtest_cmp(d);
1433 e = (xfs_dqtest_t *) d->HL_NEXT;
1434 kmem_free(d, sizeof(xfs_dqtest_t));
1435 d = e;
1436 }
1437 h1 = &qmtest_gdqtab[i];
1438 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
1439 xfs_dqtest_cmp(d);
1440 e = (xfs_dqtest_t *) d->HL_NEXT;
1441 kmem_free(d, sizeof(xfs_dqtest_t));
1442 d = e;
1443 }
1444 }
1445
1446 if (qmtest_nfails) {
1447 cmn_err(CE_DEBUG, "******** quotacheck failed ********");
1448 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails);
1449 } else {
1450 cmn_err(CE_DEBUG, "******** quotacheck successful! ********");
1451 }
1452 kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
1453 kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
1454 mutex_unlock(&qcheck_lock);
1455 return (qmtest_nfails);
1456}
1457
1458#endif /* DEBUG */
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
new file mode 100644
index 000000000000..414b6004af21
--- /dev/null
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -0,0 +1,192 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QUOTA_PRIV_H__
33#define __XFS_QUOTA_PRIV_H__
34
35/*
36 * Number of bmaps that we ask from bmapi when doing a quotacheck.
37 * We make this restriction to keep the memory usage to a minimum.
38 */
39#define XFS_DQITER_MAP_SIZE 10
40
41 /* Number of dquots that fit into a dquot block */
42#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
43
44#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \
45 MR_UPDATE | MR_ACCESS) != 0)
46#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \
47 MR_UPDATE) != 0)
48
49#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
50
51#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
52#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
53#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
54#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
55#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
56#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
57#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
58#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
59#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
60#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
61
62#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
63#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
64#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
65#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
66
67#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD))
68#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock)))
69#ifdef DEBUG
70struct xfs_dqhash;
71static inline int XQMISLCKD(struct xfs_dqhash *h)
72{
73 if (mutex_trylock(&h->qh_lock)) {
74 mutex_unlock(&h->qh_lock);
75 return 0;
76 }
77 return 1;
78}
79#endif
80
81#define XFS_DQ_HASH_LOCK(h) XQMLCK(h)
82#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h)
83#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h)
84
85#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp)))
86#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
87#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
88
89#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist))
90#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist))
91#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist))
92
93/*
94 * Hash into a bucket in the dquot hash table, based on <mp, id>.
95 */
96#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
97 (__psunsigned_t)(id)) & \
98 (xfs_Gqm->qm_dqhashmask - 1))
99#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
100 (xfs_Gqm->qm_usr_dqhtable + \
101 XFS_DQ_HASHVAL(mp, id)) : \
102 (xfs_Gqm->qm_grp_dqhtable + \
103 XFS_DQ_HASHVAL(mp, id)))
104#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
105 XFS_IS_UQUOTA_ON(mp):XFS_IS_GQUOTA_ON(mp))
106#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
107 !dqp->q_core.d_blk_hardlimit && \
108 !dqp->q_core.d_blk_softlimit && \
109 !dqp->q_core.d_rtb_hardlimit && \
110 !dqp->q_core.d_rtb_softlimit && \
111 !dqp->q_core.d_ino_hardlimit && \
112 !dqp->q_core.d_ino_softlimit && \
113 !dqp->q_core.d_bcount && \
114 !dqp->q_core.d_rtbcount && \
115 !dqp->q_core.d_icount)
116
117#define HL_PREVP dq_hashlist.ql_prevp
118#define HL_NEXT dq_hashlist.ql_next
119#define MPL_PREVP dq_mplist.ql_prevp
120#define MPL_NEXT dq_mplist.ql_next
121
122
123#define _LIST_REMOVE(h, dqp, PVP, NXT) \
124 { \
125 xfs_dquot_t *d; \
126 if (((d) = (dqp)->NXT)) \
127 (d)->PVP = (dqp)->PVP; \
128 *((dqp)->PVP) = d; \
129 (dqp)->NXT = NULL; \
130 (dqp)->PVP = NULL; \
131 (h)->qh_version++; \
132 (h)->qh_nelems--; \
133 }
134
135#define _LIST_INSERT(h, dqp, PVP, NXT) \
136 { \
137 xfs_dquot_t *d; \
138 if (((d) = (h)->qh_next)) \
139 (d)->PVP = &((dqp)->NXT); \
140 (dqp)->NXT = d; \
141 (dqp)->PVP = &((h)->qh_next); \
142 (h)->qh_next = dqp; \
143 (h)->qh_version++; \
144 (h)->qh_nelems++; \
145 }
146
147#define FOREACH_DQUOT_IN_MP(dqp, mp) \
148 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
149
150#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
151for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
152 (dqp) = (dqp)->dq_flnext)
153
154#define XQM_HASHLIST_INSERT(h, dqp) \
155 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
156
157#define XQM_FREELIST_INSERT(h, dqp) \
158 xfs_qm_freelist_append(h, dqp)
159
160#define XQM_MPLIST_INSERT(h, dqp) \
161 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
162
163#define XQM_HASHLIST_REMOVE(h, dqp) \
164 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
165#define XQM_FREELIST_REMOVE(dqp) \
166 xfs_qm_freelist_unlink(dqp)
167#define XQM_MPLIST_REMOVE(h, dqp) \
168 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
169 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
170
171#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
172
173#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
174 (tp)->t_dqinfo->dqa_usrdquots : \
175 (tp)->t_dqinfo->dqa_grpdquots)
176#define XFS_IS_SUSER_DQUOT(dqp) \
177 (!((dqp)->q_core.d_id))
178
179#define XFS_PURGE_INODE(ip) \
180 { \
181 vmap_t dqvmap; \
182 vnode_t *dqvp; \
183 dqvp = XFS_ITOV(ip); \
184 VMAP(dqvp, dqvmap); \
185 VN_RELE(dqvp); \
186 }
187
188#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
189 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : "???"))
190#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
191
192#endif /* __XFS_QUOTA_PRIV_H__ */
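
The _LIST_INSERT/_LIST_REMOVE macros above implement an intrusive doubly linked
list whose back link is a pointer to the previous link's next field (the
HL_PREVP/MPL_PREVP members), so removal needs no special case for the list
head. A minimal userspace sketch of the same technique; the node and list
types here are hypothetical stand-ins for xfs_dquot_t and xfs_dqhash_t:

#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;
	struct node **prevp;	/* points at whoever points at us */
	int id;
};

struct list {
	struct node *head;
	int nelems;
};

static void list_insert(struct list *h, struct node *n)
{
	struct node *d = h->head;
	if (d)
		d->prevp = &n->next;
	n->next = d;
	n->prevp = &h->head;
	h->head = n;
	h->nelems++;
}

static void list_remove(struct list *h, struct node *n)
{
	struct node *d = n->next;
	if (d)
		d->prevp = n->prevp;
	*n->prevp = d;		/* works for head and interior nodes alike */
	n->next = NULL;
	n->prevp = NULL;
	h->nelems--;
}

int main(void)
{
	struct list l = { NULL, 0 };
	struct node a = { .id = 1 }, b = { .id = 2 };

	list_insert(&l, &a);
	list_insert(&l, &b);	/* list is now b -> a */
	list_remove(&l, &b);	/* removing the head needs no special case */
	assert(l.head == &a && l.nelems == 1);
	return 0;
}
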
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
new file mode 100644
index 000000000000..149b2a1fd949
--- /dev/null
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -0,0 +1,941 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_trans_priv.h"
68
69#include "xfs_qm.h"
70
71STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
72
73/*
74 * Add the locked dquot to the transaction.
75 * The dquot must be locked, and it cannot be associated with any
76 * transaction.
77 */
78void
79xfs_trans_dqjoin(
80 xfs_trans_t *tp,
81 xfs_dquot_t *dqp)
82{
83 xfs_dq_logitem_t *lp;
84
85 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
86 ASSERT(XFS_DQ_IS_LOCKED(dqp));
87 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp));
88 lp = &dqp->q_logitem;
89
90 /*
91 * Get a log_item_desc to point at the new item.
92 */
93 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp));
94
95 /*
96 * Initialize q_transp so we can later determine if this dquot is
97 * associated with this transaction.
98 */
99 dqp->q_transp = tp;
100}
101
102
103/*
104 * This is called to mark the dquot as needing
105 * to be logged when the transaction is committed. The dquot must
106 * already be associated with the given transaction.
107 * Note that it marks the entire transaction as dirty. In the ordinary
108 * case, this gets called via xfs_trans_commit, after the transaction
109 * is already dirty. However, there's nothing to stop this from getting
110 * called directly, as done by xfs_qm_scall_setqlim. Hence, the XFS_TRANS_DIRTY
111 * flag.
112 */
113void
114xfs_trans_log_dquot(
115 xfs_trans_t *tp,
116 xfs_dquot_t *dqp)
117{
118 xfs_log_item_desc_t *lidp;
119
120 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
121 ASSERT(XFS_DQ_IS_LOCKED(dqp));
122
123 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
124 ASSERT(lidp != NULL);
125
126 tp->t_flags |= XFS_TRANS_DIRTY;
127 lidp->lid_flags |= XFS_LID_DIRTY;
128}
129
130/*
131 * Carry forward whatever is left of the quota blk reservation to
132 * the spanking-new transaction.
133 */
134STATIC void
135xfs_trans_dup_dqinfo(
136 xfs_trans_t *otp,
137 xfs_trans_t *ntp)
138{
139 xfs_dqtrx_t *oq, *nq;
140 int i,j;
141 xfs_dqtrx_t *oqa, *nqa;
142
143 if (!otp->t_dqinfo)
144 return;
145
146 xfs_trans_alloc_dqinfo(ntp);
147 oqa = otp->t_dqinfo->dqa_usrdquots;
148 nqa = ntp->t_dqinfo->dqa_usrdquots;
149
150 /*
151 * Because the quota blk reservation is carried forward,
152 * it is also necessary to carry forward the DQ_DIRTY flag.
153 */
154 if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
155 ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
156
157 for (j = 0; j < 2; j++) {
158 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
159 if (oqa[i].qt_dquot == NULL)
160 break;
161 oq = &oqa[i];
162 nq = &nqa[i];
163
164 nq->qt_dquot = oq->qt_dquot;
165 nq->qt_bcount_delta = nq->qt_icount_delta = 0;
166 nq->qt_rtbcount_delta = 0;
167
168 /*
169 * Transfer whatever is left of the reservations.
170 */
171 nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
172 oq->qt_blk_res = oq->qt_blk_res_used;
173
174 nq->qt_rtblk_res = oq->qt_rtblk_res -
175 oq->qt_rtblk_res_used;
176 oq->qt_rtblk_res = oq->qt_rtblk_res_used;
177
178 nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
179 oq->qt_ino_res = oq->qt_ino_res_used;
180
181 }
182 oqa = otp->t_dqinfo->dqa_grpdquots;
183 nqa = ntp->t_dqinfo->dqa_grpdquots;
184 }
185}
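
A worked example of the carry-forward arithmetic above, as a standalone
sketch; the two-field struct is a hypothetical stand-in for the relevant
xfs_dqtrx_t members:

#include <assert.h>

struct res { unsigned long blk_res, blk_res_used; };

int main(void)
{
	struct res oq = { 100, 30 };	/* old trans: reserved 100, used 30 */
	struct res nq = { 0, 0 };

	nq.blk_res = oq.blk_res - oq.blk_res_used;	/* new gets the 70 left */
	oq.blk_res = oq.blk_res_used;			/* old keeps only its 30 */

	assert(nq.blk_res == 70 && oq.blk_res == 30);
	assert(nq.blk_res + oq.blk_res == 100);		/* total is conserved */
	return 0;
}
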
186
187/*
188 * Wrap around mod_dquot to account for both user and group quotas.
189 */
190void
191xfs_trans_mod_dquot_byino(
192 xfs_trans_t *tp,
193 xfs_inode_t *ip,
194 uint field,
195 long delta)
196{
197 xfs_mount_t *mp;
198
199 ASSERT(tp);
200 mp = tp->t_mountp;
201
202 if (!XFS_IS_QUOTA_ON(mp) ||
203 ip->i_ino == mp->m_sb.sb_uquotino ||
204 ip->i_ino == mp->m_sb.sb_gquotino)
205 return;
206
207 if (tp->t_dqinfo == NULL)
208 xfs_trans_alloc_dqinfo(tp);
209
210 if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) {
211 (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
212 }
213 if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) {
214 (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
215 }
216}
217
218STATIC xfs_dqtrx_t *
219xfs_trans_get_dqtrx(
220 xfs_trans_t *tp,
221 xfs_dquot_t *dqp)
222{
223 int i;
224 xfs_dqtrx_t *qa;
225
226 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
227 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp);
228
229 if (qa[i].qt_dquot == NULL ||
230 qa[i].qt_dquot == dqp) {
231 return (&qa[i]);
232 }
233 }
234
235 return (NULL);
236}
237
238/*
239 * Make the changes in the transaction structure.
240 * The moral equivalent to xfs_trans_mod_sb().
241 * We don't touch any fields in the dquot, so we don't care
242 * if it's locked or not (most of the time it won't be).
243 */
244void
245xfs_trans_mod_dquot(
246 xfs_trans_t *tp,
247 xfs_dquot_t *dqp,
248 uint field,
249 long delta)
250{
251 xfs_dqtrx_t *qtrx;
252
253 ASSERT(tp);
254 qtrx = NULL;
255
256 if (tp->t_dqinfo == NULL)
257 xfs_trans_alloc_dqinfo(tp);
258 /*
259 * Find either the first free slot or the slot that belongs
260 * to this dquot.
261 */
262 qtrx = xfs_trans_get_dqtrx(tp, dqp);
263 ASSERT(qtrx);
264 if (qtrx->qt_dquot == NULL)
265 qtrx->qt_dquot = dqp;
266
267 switch (field) {
268
269 /*
270 * regular disk blk reservation
271 */
272 case XFS_TRANS_DQ_RES_BLKS:
273 qtrx->qt_blk_res += (ulong)delta;
274 break;
275
276 /*
277 * inode reservation
278 */
279 case XFS_TRANS_DQ_RES_INOS:
280 qtrx->qt_ino_res += (ulong)delta;
281 break;
282
283 /*
284 * disk blocks used.
285 */
286 case XFS_TRANS_DQ_BCOUNT:
287 if (qtrx->qt_blk_res && delta > 0) {
288 qtrx->qt_blk_res_used += (ulong)delta;
289 ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
290 }
291 qtrx->qt_bcount_delta += delta;
292 break;
293
294 case XFS_TRANS_DQ_DELBCOUNT:
295 qtrx->qt_delbcnt_delta += delta;
296 break;
297
298 /*
299 * Inode Count
300 */
301 case XFS_TRANS_DQ_ICOUNT:
302 if (qtrx->qt_ino_res && delta > 0) {
303 qtrx->qt_ino_res_used += (ulong)delta;
304 ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
305 }
306 qtrx->qt_icount_delta += delta;
307 break;
308
309 /*
310 * rtblk reservation
311 */
312 case XFS_TRANS_DQ_RES_RTBLKS:
313 qtrx->qt_rtblk_res += (ulong)delta;
314 break;
315
316 /*
317 * rtblk count
318 */
319 case XFS_TRANS_DQ_RTBCOUNT:
320 if (qtrx->qt_rtblk_res && delta > 0) {
321 qtrx->qt_rtblk_res_used += (ulong)delta;
322 ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
323 }
324 qtrx->qt_rtbcount_delta += delta;
325 break;
326
327 case XFS_TRANS_DQ_DELRTBCOUNT:
328 qtrx->qt_delrtb_delta += delta;
329 break;
330
331 default:
332 ASSERT(0);
333 }
334 tp->t_flags |= XFS_TRANS_DQ_DIRTY;
335}
336
337
338/*
339 * Given an array of dqtrx structures, lock all the associated dquots
340 * and join them to the transaction, provided they have been modified.
341 * We know that at most two dquots of one type (usr OR grp) can be
342 * involved in a transaction, and at most three for usr and grp combined.
343 * So we don't attempt to make this very generic.
344 */
345STATIC void
346xfs_trans_dqlockedjoin(
347 xfs_trans_t *tp,
348 xfs_dqtrx_t *q)
349{
350 ASSERT(q[0].qt_dquot != NULL);
351 if (q[1].qt_dquot == NULL) {
352 xfs_dqlock(q[0].qt_dquot);
353 xfs_trans_dqjoin(tp, q[0].qt_dquot);
354 } else {
355 ASSERT(XFS_QM_TRANS_MAXDQS == 2);
356 xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
357 xfs_trans_dqjoin(tp, q[0].qt_dquot);
358 xfs_trans_dqjoin(tp, q[1].qt_dquot);
359 }
360}
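
xfs_dqlock2() presumably takes the pair of dquot locks in one stable order so
that two transactions joining the same two dquots cannot deadlock against each
other. A generic userspace sketch of that lock-ordering idiom using pthread
mutexes ordered by address; the helper name lock_pair is made up for
illustration:

#include <pthread.h>
#include <stdint.h>

/* Always lock the lower-addressed mutex first, so any two threads
 * locking the same pair agree on the order and cannot deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if ((uintptr_t)a > (uintptr_t)b) {
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m2, &m1);	/* same order no matter how args are passed */
	pthread_mutex_unlock(&m1);
	pthread_mutex_unlock(&m2);
	return 0;
}
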
361
362
363/*
364 * Called by xfs_trans_commit() and similar in spirit to
365 * xfs_trans_apply_sb_deltas().
366 * Go through all the dquots belonging to this transaction and modify the
367 * INCORE dquot to reflect the actual usages.
368 * Unreserve just the reservations done by this transaction.
369 * The dquots are left locked at exit.
370 */
371void
372xfs_trans_apply_dquot_deltas(
373 xfs_trans_t *tp)
374{
375 int i, j;
376 xfs_dquot_t *dqp;
377 xfs_dqtrx_t *qtrx, *qa;
378 xfs_disk_dquot_t *d;
379 long totalbdelta;
380 long totalrtbdelta;
381
382 if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY))
383 return;
384
385 ASSERT(tp->t_dqinfo);
386 qa = tp->t_dqinfo->dqa_usrdquots;
387 for (j = 0; j < 2; j++) {
388 if (qa[0].qt_dquot == NULL) {
389 qa = tp->t_dqinfo->dqa_grpdquots;
390 continue;
391 }
392
393 /*
394 * Lock all of the dquots and join them to the transaction.
395 */
396 xfs_trans_dqlockedjoin(tp, qa);
397
398 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
399 qtrx = &qa[i];
400 /*
401 * The array of dquots is filled
402 * sequentially, not sparsely.
403 */
404 if ((dqp = qtrx->qt_dquot) == NULL)
405 break;
406
407 ASSERT(XFS_DQ_IS_LOCKED(dqp));
408 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
409
410 /*
411 * adjust the actual number of blocks used
412 */
413 d = &dqp->q_core;
414
415 /*
416 * The issue here is that sometimes we don't make a blkquota
417 * reservation intentionally to be fair to users
418 * (when the amount is small). On the other hand,
419 * delayed allocs do make reservations, but that's
420 * outside of a transaction, so we have no
421 * idea how much was really reserved.
422 * So, here we've accumulated delayed allocation blks and
423 * non-delay blks. The assumption is that the
424 * delayed ones are always reserved (outside of a
425 * transaction), and the others may or may not have
426 * quota reservations.
427 */
428 totalbdelta = qtrx->qt_bcount_delta +
429 qtrx->qt_delbcnt_delta;
430 totalrtbdelta = qtrx->qt_rtbcount_delta +
431 qtrx->qt_delrtb_delta;
432#ifdef QUOTADEBUG
433 if (totalbdelta < 0)
434 ASSERT(INT_GET(d->d_bcount, ARCH_CONVERT) >=
435 (xfs_qcnt_t) -totalbdelta);
436
437 if (totalrtbdelta < 0)
438 ASSERT(INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
439 (xfs_qcnt_t) -totalrtbdelta);
440
441 if (qtrx->qt_icount_delta < 0)
442 ASSERT(INT_GET(d->d_icount, ARCH_CONVERT) >=
443 (xfs_qcnt_t) -qtrx->qt_icount_delta);
444#endif
445 if (totalbdelta)
446 INT_MOD(d->d_bcount, ARCH_CONVERT, (xfs_qcnt_t)totalbdelta);
447
448 if (qtrx->qt_icount_delta)
449 INT_MOD(d->d_icount, ARCH_CONVERT, (xfs_qcnt_t)qtrx->qt_icount_delta);
450
451 if (totalrtbdelta)
452 INT_MOD(d->d_rtbcount, ARCH_CONVERT, (xfs_qcnt_t)totalrtbdelta);
453
454 /*
455 * Get any default limits in use.
456 * Start/reset the timer(s) if needed.
457 */
458 if (d->d_id) {
459 xfs_qm_adjust_dqlimits(tp->t_mountp, d);
460 xfs_qm_adjust_dqtimers(tp->t_mountp, d);
461 }
462
463 dqp->dq_flags |= XFS_DQ_DIRTY;
464 /*
465 * add this to the list of items to get logged
466 */
467 xfs_trans_log_dquot(tp, dqp);
468 /*
469 * Take off what's left of the original reservation.
470 * In case of delayed allocations, there's no
471 * reservation that a transaction structure knows of.
472 */
473 if (qtrx->qt_blk_res != 0) {
474 if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
475 if (qtrx->qt_blk_res >
476 qtrx->qt_blk_res_used)
477 dqp->q_res_bcount -= (xfs_qcnt_t)
478 (qtrx->qt_blk_res -
479 qtrx->qt_blk_res_used);
480 else
481 dqp->q_res_bcount -= (xfs_qcnt_t)
482 (qtrx->qt_blk_res_used -
483 qtrx->qt_blk_res);
484 }
485 } else {
486 /*
487 * These blks were never reserved, either inside
488 * a transaction or outside one (in a delayed
489 * allocation). Also, this isn't always a
490 * negative number since we sometimes
491 * deliberately skip quota reservations.
492 */
493 if (qtrx->qt_bcount_delta) {
494 dqp->q_res_bcount +=
495 (xfs_qcnt_t)qtrx->qt_bcount_delta;
496 }
497 }
498 /*
499 * Adjust the RT reservation.
500 */
501 if (qtrx->qt_rtblk_res != 0) {
502 if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
503 if (qtrx->qt_rtblk_res >
504 qtrx->qt_rtblk_res_used)
505 dqp->q_res_rtbcount -= (xfs_qcnt_t)
506 (qtrx->qt_rtblk_res -
507 qtrx->qt_rtblk_res_used);
508 else
509 dqp->q_res_rtbcount -= (xfs_qcnt_t)
510 (qtrx->qt_rtblk_res_used -
511 qtrx->qt_rtblk_res);
512 }
513 } else {
514 if (qtrx->qt_rtbcount_delta)
515 dqp->q_res_rtbcount +=
516 (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
517 }
518
519 /*
520 * Adjust the inode reservation.
521 */
522 if (qtrx->qt_ino_res != 0) {
523 ASSERT(qtrx->qt_ino_res >=
524 qtrx->qt_ino_res_used);
525 if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
526 dqp->q_res_icount -= (xfs_qcnt_t)
527 (qtrx->qt_ino_res -
528 qtrx->qt_ino_res_used);
529 } else {
530 if (qtrx->qt_icount_delta)
531 dqp->q_res_icount +=
532 (xfs_qcnt_t)qtrx->qt_icount_delta;
533 }
534
535
536#ifdef QUOTADEBUG
537 if (qtrx->qt_rtblk_res != 0)
538 cmn_err(CE_DEBUG, "RT res %d for 0x%p\n",
539 (int) qtrx->qt_rtblk_res, dqp);
540#endif
541 ASSERT(dqp->q_res_bcount >=
542 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
543 ASSERT(dqp->q_res_icount >=
544 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
545 ASSERT(dqp->q_res_rtbcount >=
546 INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
547 }
548 /*
549 * Do the group quotas next
550 */
551 qa = tp->t_dqinfo->dqa_grpdquots;
552 }
553}
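
A worked example of the reservation unwind above: suppose a transaction
reserved 10 blocks (qt_blk_res) and actually allocated 4 of them
(qt_blk_res_used == qt_bcount_delta == 4). Commit adds 4 to d_bcount and
takes the 6 unused blocks back off q_res_bcount, preserving the invariant
that q_res_bcount equals d_bcount plus any outstanding reservation. A
standalone sketch with hypothetical field names:

#include <assert.h>

int main(void)
{
	unsigned long d_bcount = 100;		/* on-disk usage before commit */
	unsigned long q_res_bcount = 110;	/* usage + this trans's 10-block resv */
	unsigned long blk_res = 10, blk_res_used = 4, bcount_delta = 4;

	d_bcount += bcount_delta;		/* actual usage goes up by 4 */
	q_res_bcount -= blk_res - blk_res_used;	/* return the 6 unused blocks */

	assert(d_bcount == 104 && q_res_bcount == 104);
	assert(q_res_bcount >= d_bcount);	/* the ASSERTs above check this */
	return 0;
}
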
554
555/*
556 * Release the reservations, and adjust the dquots accordingly.
557 * This is called only when the transaction is being aborted. If by
558 * any chance we have done dquot modifications incore (i.e. deltas) already,
559 * we simply throw those away, since that's the expected behavior
560 * when a transaction is curtailed without a commit.
561 */
562STATIC void
563xfs_trans_unreserve_and_mod_dquots(
564 xfs_trans_t *tp)
565{
566 int i, j;
567 xfs_dquot_t *dqp;
568 xfs_dqtrx_t *qtrx, *qa;
569 boolean_t locked;
570
571 if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
572 return;
573
574 qa = tp->t_dqinfo->dqa_usrdquots;
575
576 for (j = 0; j < 2; j++) {
577 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
578 qtrx = &qa[i];
579 /*
580 * We assume that the array of dquots is filled
581 * sequentially, not sparsely.
582 */
583 if ((dqp = qtrx->qt_dquot) == NULL)
584 break;
585 /*
586 * Unreserve the original reservation. We don't care
587 * about the number of blocks used field, or deltas.
588 * Also we don't bother to zero the fields.
589 */
590 locked = B_FALSE;
591 if (qtrx->qt_blk_res) {
592 xfs_dqlock(dqp);
593 locked = B_TRUE;
594 dqp->q_res_bcount -=
595 (xfs_qcnt_t)qtrx->qt_blk_res;
596 }
597 if (qtrx->qt_ino_res) {
598 if (!locked) {
599 xfs_dqlock(dqp);
600 locked = B_TRUE;
601 }
602 dqp->q_res_icount -=
603 (xfs_qcnt_t)qtrx->qt_ino_res;
604 }
605
606 if (qtrx->qt_rtblk_res) {
607 if (!locked) {
608 xfs_dqlock(dqp);
609 locked = B_TRUE;
610 }
611 dqp->q_res_rtbcount -=
612 (xfs_qcnt_t)qtrx->qt_rtblk_res;
613 }
614 if (locked)
615 xfs_dqunlock(dqp);
616
617 }
618 qa = tp->t_dqinfo->dqa_grpdquots;
619 }
620}
621
622/*
623 * This reserves disk blocks and inodes against a dquot.
624 * Flags indicate if the dquot is to be locked here and also
625 * if the blk reservation is for RT or regular blocks.
626 * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
627 * Returns EDQUOT if quota is exceeded.
628 */
629STATIC int
630xfs_trans_dqresv(
631 xfs_trans_t *tp,
632 xfs_mount_t *mp,
633 xfs_dquot_t *dqp,
634 long nblks,
635 long ninos,
636 uint flags)
637{
638 int error;
639 xfs_qcnt_t hardlimit;
640 xfs_qcnt_t softlimit;
641 time_t btimer;
642 xfs_qcnt_t *resbcountp;
643 xfs_quotainfo_t *q = mp->m_quotainfo;
644
645 if (! (flags & XFS_QMOPT_DQLOCK)) {
646 xfs_dqlock(dqp);
647 }
648 ASSERT(XFS_DQ_IS_LOCKED(dqp));
649 if (flags & XFS_TRANS_DQ_RES_BLKS) {
650 hardlimit = INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT);
651 if (!hardlimit)
652 hardlimit = q->qi_bhardlimit;
653 softlimit = INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT);
654 if (!softlimit)
655 softlimit = q->qi_bsoftlimit;
656 btimer = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT);
657 resbcountp = &dqp->q_res_bcount;
658 } else {
659 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
660 hardlimit = INT_GET(dqp->q_core.d_rtb_hardlimit, ARCH_CONVERT);
661 if (!hardlimit)
662 hardlimit = q->qi_rtbhardlimit;
663 softlimit = INT_GET(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT);
664 if (!softlimit)
665 softlimit = q->qi_rtbsoftlimit;
666 btimer = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT);
667 resbcountp = &dqp->q_res_rtbcount;
668 }
669 error = 0;
670
671 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
672 dqp->q_core.d_id &&
673 XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) {
674#ifdef QUOTADEBUG
675 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld"
676 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit);
677#endif
678 if (nblks > 0) {
679 /*
680 * dquot is locked already. See if we'd go over the
681 * hardlimit or exceed the timelimit if we allocate
682 * nblks.
683 */
684 if (hardlimit > 0ULL &&
685 (hardlimit <= nblks + *resbcountp)) {
686 error = EDQUOT;
687 goto error_return;
688 }
689
690 if (softlimit > 0ULL &&
691 (softlimit <= nblks + *resbcountp)) {
692 /*
693 * If the timer has expired or the warning
694 * limit has been reached, return EDQUOT.
695 */
696 if ((btimer != 0 && get_seconds() > btimer) ||
697 (dqp->q_core.d_bwarns &&
698 INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) >=
699 XFS_QI_BWARNLIMIT(dqp->q_mount))) {
700 error = EDQUOT;
701 goto error_return;
702 }
703 }
704 }
705 if (ninos > 0) {
706 hardlimit = INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT);
707 if (!hardlimit)
708 hardlimit = q->qi_ihardlimit;
709 softlimit = INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT);
710 if (!softlimit)
711 softlimit = q->qi_isoftlimit;
712 if (hardlimit > 0ULL &&
713 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= hardlimit) {
714 error = EDQUOT;
715 goto error_return;
716 } else if (softlimit > 0ULL &&
717 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= softlimit) {
718 /*
719 * If the timer has expired or the warning
720 * limit has been reached, return EDQUOT.
721 */
722 if ((dqp->q_core.d_itimer &&
723 get_seconds() > INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT)) ||
724 (dqp->q_core.d_iwarns &&
725 INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) >=
726 XFS_QI_IWARNLIMIT(dqp->q_mount))) {
727 error = EDQUOT;
728 goto error_return;
729 }
730 }
731 }
732 }
733
734 /*
735 * Change the reservation, but not the actual usage.
736 * Note that q_res_bcount = q_core.d_bcount + resv
737 */
738 (*resbcountp) += (xfs_qcnt_t)nblks;
739 if (ninos != 0)
740 dqp->q_res_icount += (xfs_qcnt_t)ninos;
741
742 /*
743 * note the reservation amt in the trans struct too,
744 * so that the transaction knows how much was reserved by
745 * it against this particular dquot.
746 * We don't do this when we are reserving for a delayed allocation,
747 * because we don't have the luxury of a transaction envelope then.
748 */
749 if (tp) {
750 ASSERT(tp->t_dqinfo);
751 ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
752 if (nblks != 0)
753 xfs_trans_mod_dquot(tp, dqp,
754 flags & XFS_QMOPT_RESBLK_MASK,
755 nblks);
756 if (ninos != 0)
757 xfs_trans_mod_dquot(tp, dqp,
758 XFS_TRANS_DQ_RES_INOS,
759 ninos);
760 }
761 ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
762 ASSERT(dqp->q_res_rtbcount >= INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
763 ASSERT(dqp->q_res_icount >= INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
764
765error_return:
766 if (! (flags & XFS_QMOPT_DQLOCK)) {
767 xfs_dqunlock(dqp);
768 }
769 return (error);
770}
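
The enforcement rule above is: fail outright at the hard limit, but fail at
the soft limit only once the grace timer has expired or too many warnings
have been issued. A standalone sketch of just that decision, assuming
nblks > 0 as in the path above; all names are illustrative:

#include <time.h>

static int would_exceed(unsigned long long resv, long nblks,
			unsigned long long hard, unsigned long long soft,
			time_t timer, int warns, int warnlimit)
{
	/* a zero limit means "no limit of this kind" */
	if (hard && resv + nblks >= hard)
		return 1;			/* EDQUOT */
	if (soft && resv + nblks >= soft) {
		if ((timer && time(NULL) > timer) ||
		    (warns && warns >= warnlimit))
			return 1;		/* EDQUOT: grace period is over */
	}
	return 0;				/* reservation allowed */
}
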
771
772
773/*
774 * Given a dquot(s), make disk block and/or inode reservations against them.
775 * The fact that this does the reservation against both the usr and
776 * grp quotas is important, because this follows a both-or-nothing
777 * approach.
778 *
779 * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked.
780 * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
781 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
782 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
783 * dquots are unlocked on return, if they were not locked by caller.
784 */
785int
786xfs_trans_reserve_quota_bydquots(
787 xfs_trans_t *tp,
788 xfs_mount_t *mp,
789 xfs_dquot_t *udqp,
790 xfs_dquot_t *gdqp,
791 long nblks,
792 long ninos,
793 uint flags)
794{
795 int resvd;
796
797 if (! XFS_IS_QUOTA_ON(mp))
798 return (0);
799
800 if (tp && tp->t_dqinfo == NULL)
801 xfs_trans_alloc_dqinfo(tp);
802
803 ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
804 resvd = 0;
805
806 if (udqp) {
807 if (xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags))
808 return (EDQUOT);
809 resvd = 1;
810 }
811
812 if (gdqp) {
813 if (xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags)) {
814 /*
815 * can't do it, so backout previous reservation
816 */
817 if (resvd) {
818 flags |= XFS_QMOPT_FORCE_RES;
819 xfs_trans_dqresv(tp, mp, udqp,
820 -nblks, -ninos, flags);
821 }
822 return (EDQUOT);
823 }
824 }
825
826 /*
827 * Didn't change anything critical, so there's no need to log.
828 */
829 return (0);
830}
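
The function above is both-or-nothing: if the group reservation fails after
the user one succeeded, the user reservation is backed out by re-reserving
negative counts with XFS_QMOPT_FORCE_RES, so the undo itself cannot fail. A
self-contained sketch of that pattern; all names here are illustrative:

#include <assert.h>

#define FORCE 1

struct resv { long limit, held; };

/* Reserve nblks against r; with FORCE the limit check is skipped,
 * which is what makes the undo path infallible. */
static int do_reserve(struct resv *r, long nblks, int flags)
{
	if (!(flags & FORCE) && r->held + nblks > r->limit)
		return -1;			/* over quota */
	r->held += nblks;
	return 0;
}

static int reserve_both(struct resv *usr, struct resv *grp, long nblks)
{
	if (do_reserve(usr, nblks, 0))
		return -1;
	if (do_reserve(grp, nblks, 0)) {
		do_reserve(usr, -nblks, FORCE);	/* back out, cannot fail */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct resv usr = { 100, 0 }, grp = { 5, 0 };

	assert(reserve_both(&usr, &grp, 10) != 0);	/* grp is over quota */
	assert(usr.held == 0 && grp.held == 0);		/* fully backed out */
	return 0;
}
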
831
832
833/*
834 * Lock the dquot and change the reservation if we can.
835 * This doesn't change the actual usage, just the reservation.
836 * The inode sent in is locked.
837 *
838 * Returns 0 on success, EDQUOT or other errors otherwise
839 */
840STATIC int
841xfs_trans_reserve_quota_nblks(
842 xfs_trans_t *tp,
843 xfs_mount_t *mp,
844 xfs_inode_t *ip,
845 long nblks,
846 long ninos,
847 uint type)
848{
849 int error;
850
851 if (!XFS_IS_QUOTA_ON(mp))
852 return (0);
853
854 ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
855 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
856
857 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
858 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
859 ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS ||
860 (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS);
861
862 /*
863 * Reserve nblks against these dquots, with trans as the mediator.
864 */
865 error = xfs_trans_reserve_quota_bydquots(tp, mp,
866 ip->i_udquot, ip->i_gdquot,
867 nblks, ninos,
868 type);
869 return (error);
870}
871
872/*
873 * This routine is called to allocate a quotaoff log item.
874 */
875xfs_qoff_logitem_t *
876xfs_trans_get_qoff_item(
877 xfs_trans_t *tp,
878 xfs_qoff_logitem_t *startqoff,
879 uint flags)
880{
881 xfs_qoff_logitem_t *q;
882
883 ASSERT(tp != NULL);
884
885 q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
886 ASSERT(q != NULL);
887
888 /*
889 * Get a log_item_desc to point at the new item.
890 */
891 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q);
892
893 return (q);
894}
895
896
897/*
898 * This is called to mark the quotaoff logitem as needing
899 * to be logged when the transaction is committed. The logitem must
900 * already be associated with the given transaction.
901 */
902void
903xfs_trans_log_quotaoff_item(
904 xfs_trans_t *tp,
905 xfs_qoff_logitem_t *qlp)
906{
907 xfs_log_item_desc_t *lidp;
908
909 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
910 ASSERT(lidp != NULL);
911
912 tp->t_flags |= XFS_TRANS_DIRTY;
913 lidp->lid_flags |= XFS_LID_DIRTY;
914}
915
916STATIC void
917xfs_trans_alloc_dqinfo(
918 xfs_trans_t *tp)
919{
920 (tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
921}
922
923STATIC void
924xfs_trans_free_dqinfo(
925 xfs_trans_t *tp)
926{
927 if (!tp->t_dqinfo)
928 return;
929 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo);
930 (tp)->t_dqinfo = NULL;
931}
932
933xfs_dqtrxops_t xfs_trans_dquot_ops = {
934 .qo_dup_dqinfo = xfs_trans_dup_dqinfo,
935 .qo_free_dqinfo = xfs_trans_free_dqinfo,
936 .qo_mod_dquot_byino = xfs_trans_mod_dquot_byino,
937 .qo_apply_dquot_deltas = xfs_trans_apply_dquot_deltas,
938 .qo_reserve_quota_nblks = xfs_trans_reserve_quota_nblks,
939 .qo_reserve_quota_bydquots = xfs_trans_reserve_quota_bydquots,
940 .qo_unreserve_and_mod_dquots = xfs_trans_unreserve_and_mod_dquots,
941};
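
The xfs_dqtrxops_t table above is an ops vector: the transaction core calls
the quota hooks through function pointers, so it never references quota
symbols directly, and a missing table can mean quota support is absent. A
standalone sketch of the pattern; the names here are illustrative:

#include <stdio.h>

struct dqtrxops {
	void (*apply_deltas)(void *tp);
	void (*free_info)(void *tp);
};

static void my_apply(void *tp) { printf("applying deltas for %p\n", tp); }
static void my_free(void *tp)  { printf("freeing dqinfo for %p\n", tp); }

static const struct dqtrxops quota_ops = {
	.apply_deltas = my_apply,
	.free_info = my_free,
};

/* The core treats a NULL table (or NULL slot) as "feature not present". */
static void trans_commit(void *tp, const struct dqtrxops *ops)
{
	if (ops && ops->apply_deltas)
		ops->apply_deltas(tp);
	if (ops && ops->free_info)
		ops->free_info(tp);
}

int main(void)
{
	int tp;
	trans_commit(&tp, &quota_ops);	/* with quota */
	trans_commit(&tp, NULL);	/* without quota: no-ops */
	return 0;
}
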
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
new file mode 100644
index 000000000000..7d6e1f37df10
--- /dev/null
+++ b/fs/xfs/support/debug.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "debug.h"
34
35#include <asm/page.h>
36#include <linux/sched.h>
37#include <linux/kernel.h>
38
39int doass = 1;
40static char message[256]; /* keep it off the stack */
41static DEFINE_SPINLOCK(xfs_err_lock);
42
43/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
44#define XFS_MAX_ERR_LEVEL 7
45#define XFS_ERR_MASK ((1 << 3) - 1)
46static char *err_level[XFS_MAX_ERR_LEVEL+1] =
47 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
48 KERN_ERR, KERN_WARNING, KERN_NOTICE,
49 KERN_INFO, KERN_DEBUG};
50
51void
52assfail(char *a, char *f, int l)
53{
54 printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l);
55 BUG();
56}
57
58#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM))
59
60unsigned long
61random(void)
62{
63 static unsigned long RandomValue = 1;
64 /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
65 register long rv = RandomValue;
66 register long lo;
67 register long hi;
68
69 hi = rv / 127773;
70 lo = rv % 127773;
71 rv = 16807 * lo - 2836 * hi;
72 if( rv <= 0 ) rv += 2147483647;
73 return( RandomValue = rv );
74}
75
76int
77get_thread_id(void)
78{
79 return current->pid;
80}
81
82#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */
83
84void
85cmn_err(register int level, char *fmt, ...)
86{
87 char *fp = fmt;
88 int len;
89 ulong flags;
90 va_list ap;
91
92 level &= XFS_ERR_MASK;
93 if (level > XFS_MAX_ERR_LEVEL)
94 level = XFS_MAX_ERR_LEVEL;
95 spin_lock_irqsave(&xfs_err_lock,flags);
96 va_start(ap, fmt);
 97 if (*fmt == '!') fp++;	/* skip leading '!', an IRIX cmn_err marker */
98 len = vsprintf(message, fp, ap);
99 if (message[len-1] != '\n')
100 strcat(message, "\n");
101 printk("%s%s", err_level[level], message);
102 va_end(ap);
103 spin_unlock_irqrestore(&xfs_err_lock,flags);
104
105 if (level == CE_PANIC)
106 BUG();
107}
108
109
110void
111icmn_err(register int level, char *fmt, va_list ap)
112{
113 ulong flags;
114 int len;
115
116 level &= XFS_ERR_MASK;
117 if(level > XFS_MAX_ERR_LEVEL)
118 level = XFS_MAX_ERR_LEVEL;
119 spin_lock_irqsave(&xfs_err_lock,flags);
120 len = vsprintf(message, fmt, ap);
121 if (message[len-1] != '\n')
122 strcat(message, "\n");
123 spin_unlock_irqrestore(&xfs_err_lock,flags);
124 printk("%s%s", err_level[level], message);
125 if (level == CE_PANIC)
126 BUG();
127}
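
cmn_err() above maps IRIX-style CE_* levels onto Linux KERN_* prefixes with a
table lookup, masking and clamping the level first. A standalone sketch of
the same translation; the literal "<n>" strings stand in for the KERN_*
macros and the helper name is made up:

#include <stdio.h>

#define MAX_LEVEL 7
#define LEVEL_MASK ((1 << 3) - 1)

static const char *err_level[MAX_LEVEL + 1] = {
	"<0>", "<1>", "<2>", "<3>", "<4>", "<5>", "<6>", "<7>"
};

/* Mask the level into range, clamp it, then prefix the message with
 * the string for that level, just as the function above does. */
static void demo_err(int level, const char *msg)
{
	level &= LEVEL_MASK;
	if (level > MAX_LEVEL)
		level = MAX_LEVEL;
	printf("%s%s\n", err_level[level], msg);
}

int main(void)
{
	demo_err(7, "a CE_DEBUG message");	/* prints "<7>a CE_DEBUG message" */
	return 0;
}
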
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
new file mode 100644
index 000000000000..40b0f4c54d9e
--- /dev/null
+++ b/fs/xfs/support/debug.h
@@ -0,0 +1,72 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPPORT_DEBUG_H__
33#define __XFS_SUPPORT_DEBUG_H__
34
35#include <stdarg.h>
36
37#define CE_DEBUG 7 /* debug */
38#define CE_CONT 6 /* continuation */
39#define CE_NOTE 5 /* notice */
40#define CE_WARN 4 /* warning */
41#define CE_ALERT 1 /* alert */
42#define CE_PANIC 0 /* panic */
43
44extern void icmn_err(int, char *, va_list);
45/* PRINTFLIKE2 */
46extern void cmn_err(int, char *, ...);
47
48#ifndef STATIC
49# define STATIC static
50#endif
51
52#ifdef DEBUG
53# ifdef lint
54# define ASSERT(EX) ((void)0) /* avoid "constant in conditional" babble */
55# else
56# define ASSERT(EX) ((!doass||(EX))?((void)0):assfail(#EX, __FILE__, __LINE__))
57# endif /* lint */
58#else
59# define ASSERT(x) ((void)0)
60#endif
61
62extern int doass; /* dynamically turn off asserts */
63extern void assfail(char *, char *, int);
64#ifdef DEBUG
65extern unsigned long random(void);
66extern int get_thread_id(void);
67#endif
68
69#define ASSERT_ALWAYS(EX) ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__))
70#define debug_stop_all_cpus(param) /* param is "cpumask_t *" */
71
72#endif /* __XFS_SUPPORT_DEBUG_H__ */
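
The ASSERT macro above is compiled in only for DEBUG builds and can also be
switched off at run time by clearing doass. A standalone sketch of the same
two-level gating; build with -DDEBUG to activate the checks:

#include <stdio.h>
#include <stdlib.h>

static int doass = 1;	/* run-time switch, like the kernel's doass */

static void assfail(const char *a, const char *f, int l)
{
	fprintf(stderr, "assertion failed: %s, file: %s, line: %d\n", a, f, l);
	abort();
}

#ifdef DEBUG
#define ASSERT(EX) ((!doass || (EX)) ? (void)0 : assfail(#EX, __FILE__, __LINE__))
#else
#define ASSERT(EX) ((void)0)	/* compiles away entirely */
#endif

int main(void)
{
	ASSERT(1 + 1 == 2);	/* passes (or vanishes without -DDEBUG) */
	doass = 0;
	ASSERT(0);		/* skipped: asserts disabled at run time */
	return 0;
}
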
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
new file mode 100644
index 000000000000..3dae14c8c55a
--- /dev/null
+++ b/fs/xfs/support/ktrace.c
@@ -0,0 +1,346 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <xfs.h>
34
35static kmem_zone_t *ktrace_hdr_zone;
36static kmem_zone_t *ktrace_ent_zone;
37static int ktrace_zentries;
38
39void
40ktrace_init(int zentries)
41{
42 ktrace_zentries = zentries;
43
44 ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
45 "ktrace_hdr");
46 ASSERT(ktrace_hdr_zone);
47
48 ktrace_ent_zone = kmem_zone_init(ktrace_zentries
49 * sizeof(ktrace_entry_t),
50 "ktrace_ent");
51 ASSERT(ktrace_ent_zone);
52}
53
54void
55ktrace_uninit(void)
56{
57 kmem_cache_destroy(ktrace_hdr_zone);
58 kmem_cache_destroy(ktrace_ent_zone);
59}
60
61/*
62 * ktrace_alloc()
63 *
64 * Allocate a ktrace header and enough buffering for the given
65 * number of entries.
66 */
67ktrace_t *
68ktrace_alloc(int nentries, int sleep)
69{
70 ktrace_t *ktp;
71 ktrace_entry_t *ktep;
72
73 ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
74
75 if (ktp == (ktrace_t*)NULL) {
76 /*
77 * KM_SLEEP callers don't expect failure.
78 */
79 if (sleep & KM_SLEEP)
80 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
81
82 return NULL;
83 }
84
85 /*
86 * Buffers of the default size (ktrace_zentries entries) come from the zone.
87 */
88 if (nentries == ktrace_zentries) {
89 ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
90 sleep);
91 } else {
92 ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
93 sleep);
94 }
95
96 if (ktep == NULL) {
97 /*
98 * KM_SLEEP callers don't expect failure.
99 */
100 if (sleep & KM_SLEEP)
101 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
102
103 kmem_free(ktp, sizeof(*ktp));
104
105 return NULL;
106 }
107
108 spinlock_init(&(ktp->kt_lock), "kt_lock");
109
110 ktp->kt_entries = ktep;
111 ktp->kt_nentries = nentries;
112 ktp->kt_index = 0;
113 ktp->kt_rollover = 0;
114 return ktp;
115}
116
117
118/*
119 * ktrace_free()
120 *
121 * Free up the ktrace header and buffer. It is up to the caller
122 * to ensure that no-one is referencing it.
123 */
124void
125ktrace_free(ktrace_t *ktp)
126{
127 int entries_size;
128
129 if (ktp == (ktrace_t *)NULL)
130 return;
131
132 spinlock_destroy(&ktp->kt_lock);
133
134 /*
135 * Zone-sized buffers (ktrace_zentries entries) go back to the zone.
136 */
137 if (ktp->kt_nentries == ktrace_zentries) {
138 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
139 } else {
140 entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
141
142 kmem_free(ktp->kt_entries, entries_size);
143 }
144
145 kmem_zone_free(ktrace_hdr_zone, ktp);
146}
147
148
149/*
150 * Enter the given values into the "next" entry in the trace buffer.
151 * kt_index is always the index of the next entry to be filled.
152 */
153void
154ktrace_enter(
155 ktrace_t *ktp,
156 void *val0,
157 void *val1,
158 void *val2,
159 void *val3,
160 void *val4,
161 void *val5,
162 void *val6,
163 void *val7,
164 void *val8,
165 void *val9,
166 void *val10,
167 void *val11,
168 void *val12,
169 void *val13,
170 void *val14,
171 void *val15)
172{
173 static lock_t wrap_lock = SPIN_LOCK_UNLOCKED;
174 unsigned long flags;
175 int index;
176 ktrace_entry_t *ktep;
177
178 ASSERT(ktp != NULL);
179
180 /*
181 * Grab an entry by pushing the index up to the next one.
182 */
183 spin_lock_irqsave(&wrap_lock, flags);
184 index = ktp->kt_index;
185 if (++ktp->kt_index == ktp->kt_nentries)
186 ktp->kt_index = 0;
187 spin_unlock_irqrestore(&wrap_lock, flags);
188
189 if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
190 ktp->kt_rollover = 1;
191
192 ASSERT((index >= 0) && (index < ktp->kt_nentries));
193
194 ktep = &(ktp->kt_entries[index]);
195
196 ktep->val[0] = val0;
197 ktep->val[1] = val1;
198 ktep->val[2] = val2;
199 ktep->val[3] = val3;
200 ktep->val[4] = val4;
201 ktep->val[5] = val5;
202 ktep->val[6] = val6;
203 ktep->val[7] = val7;
204 ktep->val[8] = val8;
205 ktep->val[9] = val9;
206 ktep->val[10] = val10;
207 ktep->val[11] = val11;
208 ktep->val[12] = val12;
209 ktep->val[13] = val13;
210 ktep->val[14] = val14;
211 ktep->val[15] = val15;
212}
213
214/*
215 * Return the number of entries in the trace buffer.
216 */
217int
218ktrace_nentries(
219 ktrace_t *ktp)
220{
221 if (ktp == NULL) {
222 return 0;
223 }
224
225 return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
226}
227
228/*
229 * ktrace_first()
230 *
231 * This is used to find the start of the trace buffer.
232 * In conjunction with ktrace_next() it can be used to
233 * iterate through the entire trace buffer. This code does
234 * not do any locking because it is assumed that it is called
235 * from the debugger.
236 *
237 * The caller must pass in a pointer to a ktrace_snap
238 * structure in which we will keep some state used to
239 * iterate through the buffer. This state must not be touched
240 * by any code outside of this module.
241 */
242ktrace_entry_t *
243ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp)
244{
245 ktrace_entry_t *ktep;
246 int index;
247 int nentries;
248
249 if (ktp->kt_rollover)
250 index = ktp->kt_index;
251 else
252 index = 0;
253
254 ktsp->ks_start = index;
255 ktep = &(ktp->kt_entries[index]);
256
257 nentries = ktrace_nentries(ktp);
258 index++;
259 if (index < nentries) {
260 ktsp->ks_index = index;
261 } else {
262 ktsp->ks_index = 0;
263 if (index > nentries)
264 ktep = NULL;
265 }
266 return ktep;
267}
268
269/*
270 * ktrace_next()
271 *
272 * This is used to iterate through the entries of the given
273 * trace buffer. The caller must pass in the ktrace_snap_t
274 * structure initialized by ktrace_first(). The return value
275 * will be either a pointer to the next ktrace_entry or NULL
276 * if all of the entries have been traversed.
277 */
278ktrace_entry_t *
279ktrace_next(
280 ktrace_t *ktp,
281 ktrace_snap_t *ktsp)
282{
283 int index;
284 ktrace_entry_t *ktep;
285
286 index = ktsp->ks_index;
287 if (index == ktsp->ks_start) {
288 ktep = NULL;
289 } else {
290 ktep = &ktp->kt_entries[index];
291 }
292
293 index++;
294 if (index == ktrace_nentries(ktp)) {
295 ktsp->ks_index = 0;
296 } else {
297 ktsp->ks_index = index;
298 }
299
300 return ktep;
301}
302
303/*
304 * ktrace_skip()
305 *
306 * Skip the next "count" entries and return the entry after that.
307 * Return NULL if this causes us to iterate past the beginning again.
308 */
309ktrace_entry_t *
310ktrace_skip(
311 ktrace_t *ktp,
312 int count,
313 ktrace_snap_t *ktsp)
314{
315 int index;
316 int new_index;
317 ktrace_entry_t *ktep;
318 int nentries = ktrace_nentries(ktp);
319
320 index = ktsp->ks_index;
321 new_index = index + count;
322 while (new_index >= nentries) {
323 new_index -= nentries;
324 }
325 if (index == ktsp->ks_start) {
326 /*
327 * We've iterated around to the start, so we're done.
328 */
329 ktep = NULL;
330 } else if ((new_index < index) && (index < ktsp->ks_index)) {
331 /*
332 * We've skipped past the start again, so we're done.
333 */
334 ktep = NULL;
335 ktsp->ks_index = ktsp->ks_start;
336 } else {
337 ktep = &(ktp->kt_entries[new_index]);
338 new_index++;
339 if (new_index == nentries) {
340 ktsp->ks_index = 0;
341 } else {
342 ktsp->ks_index = new_index;
343 }
344 }
345 return ktep;
346}
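
ktrace_first()/ktrace_next() above traverse a rollover ring: once the buffer
has wrapped, the oldest entry sits at kt_index; otherwise traversal starts at
0, and it stops when the snapshot index comes back around to where it began.
A minimal standalone model of that traversal; the names are illustrative:

#include <stdio.h>

#define N 4

int main(void)
{
	int buf[N];
	int index = 0, rollover = 0;

	for (int v = 1; v <= 6; v++) {		/* write 6 entries into 4 slots */
		buf[index] = v;
		if (++index == N) {
			index = 0;
			rollover = 1;
		}
	}

	/* If we rolled over, the oldest entry is at `index`; else at 0. */
	int start = rollover ? index : 0;
	int nentries = rollover ? N : index;
	int i = start;
	for (int seen = 0; seen < nentries; seen++) {
		printf("%d ", buf[i]);		/* prints: 3 4 5 6 */
		if (++i == nentries)
			i = 0;
	}
	printf("\n");
	return 0;
}
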
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
new file mode 100644
index 000000000000..92d1a1a5d04b
--- /dev/null
+++ b/fs/xfs/support/ktrace.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPPORT_KTRACE_H__
33#define __XFS_SUPPORT_KTRACE_H__
34
35#include <spin.h>
36
37/*
38 * Trace buffer entry structure.
39 */
40typedef struct ktrace_entry {
41 void *val[16];
42} ktrace_entry_t;
43
44/*
45 * Trace buffer header structure.
46 */
47typedef struct ktrace {
48 lock_t kt_lock; /* mutex to guard counters */
49 int kt_nentries; /* number of entries in trace buf */
50 int kt_index; /* current index in entries */
51 int kt_rollover;
52 ktrace_entry_t *kt_entries; /* buffer of entries */
53} ktrace_t;
54
55/*
56 * Trace buffer snapshot structure.
57 */
58typedef struct ktrace_snap {
59 int ks_start; /* kt_index at time of snap */
60 int ks_index; /* current index */
61} ktrace_snap_t;
62
63
64#ifdef CONFIG_XFS_TRACE
65
66extern void ktrace_init(int zentries);
67extern void ktrace_uninit(void);
68
69extern ktrace_t *ktrace_alloc(int, int);
70extern void ktrace_free(ktrace_t *);
71
72extern void ktrace_enter(
73 ktrace_t *,
74 void *,
75 void *,
76 void *,
77 void *,
78 void *,
79 void *,
80 void *,
81 void *,
82 void *,
83 void *,
84 void *,
85 void *,
86 void *,
87 void *,
88 void *,
89 void *);
90
91extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *);
92extern int ktrace_nentries(ktrace_t *);
93extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *);
94extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
95
96#else
97#define ktrace_init(x) do { } while (0)
98#define ktrace_uninit() do { } while (0)
99#endif /* CONFIG_XFS_TRACE */
100
101#endif /* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c
new file mode 100644
index 000000000000..15b5194f16b2
--- /dev/null
+++ b/fs/xfs/support/move.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <xfs.h>
34
35/* Read from kernel buffer at src to user/kernel buffer defined
36 * by the uio structure. Advance the pointer in the uio struct
37 * as we go.
38 */
39int
40uio_read(caddr_t src, size_t len, struct uio *uio)
41{
42 size_t count;
43
44 if (!len || !uio->uio_resid)
45 return 0;
46
47 count = uio->uio_iov->iov_len;
48 if (!count)
49 return 0;
50 if (count > len)
51 count = len;
52
53 if (uio->uio_segflg == UIO_USERSPACE) {
54 if (copy_to_user(uio->uio_iov->iov_base, src, count))
55 return EFAULT;
56 } else {
57 ASSERT(uio->uio_segflg == UIO_SYSSPACE);
58 memcpy(uio->uio_iov->iov_base, src, count);
59 }
60
61 uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count);
62 uio->uio_iov->iov_len -= count;
63 uio->uio_offset += count;
64 uio->uio_resid -= count;
65 return 0;
66}
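
uio_read() above advances the iovec base, its remaining length, the file
offset, and the residual count in lockstep. A userspace sketch of that
bookkeeping for the UIO_SYSSPACE (memcpy) case; the trimmed-down struct
names are hypothetical:

#include <assert.h>
#include <string.h>

struct miovec { char *base; size_t len; };
struct muio { struct miovec *iov; long offset; size_t resid; };

/* Copy up to len bytes into the current iovec and advance base,
 * iov_len, offset and resid together, as uio_read() does above. */
static size_t muio_read(const char *src, size_t len, struct muio *uio)
{
	size_t count = uio->iov->len;

	if (!len || !uio->resid || !count)
		return 0;
	if (count > len)
		count = len;

	memcpy(uio->iov->base, src, count);
	uio->iov->base += count;
	uio->iov->len -= count;
	uio->offset += count;
	uio->resid -= count;
	return count;
}

int main(void)
{
	char dst[8];
	struct miovec iov = { dst, sizeof(dst) };
	struct muio uio = { &iov, 0, sizeof(dst) };

	assert(muio_read("abcd", 4, &uio) == 4);
	assert(uio.offset == 4 && uio.resid == 4 && iov.len == 4);
	assert(memcmp(dst, "abcd", 4) == 0);
	return 0;
}
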
diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h
new file mode 100644
index 000000000000..3d406dc1c89e
--- /dev/null
+++ b/fs/xfs/support/move.h
@@ -0,0 +1,84 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 *
32 * Portions Copyright (c) 1982, 1986, 1993, 1994
33 * The Regents of the University of California. All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 */
59#ifndef __XFS_SUPPORT_MOVE_H__
60#define __XFS_SUPPORT_MOVE_H__
61
62#include <linux/uio.h>
63#include <asm/uaccess.h>
64
65/* Segment flag values. */
66enum uio_seg {
67 UIO_USERSPACE, /* from user data space */
68 UIO_SYSSPACE, /* from system space */
69};
70
71struct uio {
72 struct iovec *uio_iov; /* pointer to array of iovecs */
73 int uio_iovcnt; /* number of iovecs in array */
74 xfs_off_t uio_offset; /* offset in file this uio corresponds to */
75 int uio_resid; /* residual i/o count */
76 enum uio_seg uio_segflg; /* see above */
77};
78
79typedef struct uio uio_t;
80typedef struct iovec iovec_t;
81
82extern int uio_read (caddr_t, size_t, uio_t *);
83
84#endif /* __XFS_SUPPORT_MOVE_H__ */
diff --git a/fs/xfs/support/qsort.c b/fs/xfs/support/qsort.c
new file mode 100644
index 000000000000..1ec824140cf7
--- /dev/null
+++ b/fs/xfs/support/qsort.c
@@ -0,0 +1,155 @@
1/*
2 * Copyright (c) 1992, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <linux/kernel.h>
31#include <linux/string.h>
32
33/*
34 * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
35 */
36#define swapcode(TYPE, parmi, parmj, n) { \
37 long i = (n) / sizeof (TYPE); \
38 register TYPE *pi = (TYPE *) (parmi); \
39 register TYPE *pj = (TYPE *) (parmj); \
40 do { \
41 register TYPE t = *pi; \
42 *pi++ = *pj; \
43 *pj++ = t; \
44 } while (--i > 0); \
45}
46
47#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
48 es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
49
50static __inline void
51swapfunc(char *a, char *b, int n, int swaptype)
52{
53 if (swaptype <= 1)
54 swapcode(long, a, b, n)
55 else
56 swapcode(char, a, b, n)
57}
58
59#define swap(a, b) \
60 if (swaptype == 0) { \
61 long t = *(long *)(a); \
62 *(long *)(a) = *(long *)(b); \
63 *(long *)(b) = t; \
64 } else \
65 swapfunc(a, b, es, swaptype)
66
67#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype)
68
69static __inline char *
70med3(char *a, char *b, char *c, int (*cmp)(const void *, const void *))
71{
72 return cmp(a, b) < 0 ?
73 (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a ))
74 :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
75}
76
77void
78qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *))
79{
80 char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
81 int d, r, swaptype, swap_cnt;
82 register char *a = aa;
83
84loop: SWAPINIT(a, es);
85 swap_cnt = 0;
86 if (n < 7) {
87 for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es)
88 for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
89 pl -= es)
90 swap(pl, pl - es);
91 return;
92 }
93 pm = (char *)a + (n / 2) * es;
94 if (n > 7) {
95 pl = (char *)a;
96 pn = (char *)a + (n - 1) * es;
97 if (n > 40) {
98 d = (n / 8) * es;
99 pl = med3(pl, pl + d, pl + 2 * d, cmp);
100 pm = med3(pm - d, pm, pm + d, cmp);
101 pn = med3(pn - 2 * d, pn - d, pn, cmp);
102 }
103 pm = med3(pl, pm, pn, cmp);
104 }
105 swap(a, pm);
106 pa = pb = (char *)a + es;
107
108 pc = pd = (char *)a + (n - 1) * es;
109 for (;;) {
110 while (pb <= pc && (r = cmp(pb, a)) <= 0) {
111 if (r == 0) {
112 swap_cnt = 1;
113 swap(pa, pb);
114 pa += es;
115 }
116 pb += es;
117 }
118 while (pb <= pc && (r = cmp(pc, a)) >= 0) {
119 if (r == 0) {
120 swap_cnt = 1;
121 swap(pc, pd);
122 pd -= es;
123 }
124 pc -= es;
125 }
126 if (pb > pc)
127 break;
128 swap(pb, pc);
129 swap_cnt = 1;
130 pb += es;
131 pc -= es;
132 }
133 if (swap_cnt == 0) { /* Switch to insertion sort */
134 for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
135 for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
136 pl -= es)
137 swap(pl, pl - es);
138 return;
139 }
140
141 pn = (char *)a + n * es;
142 r = min(pa - (char *)a, pb - pa);
143 vecswap(a, pb - r, r);
144 r = min((long)(pd - pc), (long)(pn - pd - es));
145 vecswap(pb, pn - r, r);
146 if ((r = pb - pa) > es)
147 qsort(a, r / es, es, cmp);
148 if ((r = pd - pc) > es) {
149 /* Iterate rather than recurse to save stack space */
150 a = pn - r;
151 n = r / es;
152 goto loop;
153 }
154/* qsort(pn - r, r / es, es, cmp);*/
155}
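/*
 * [Editor's sketch, not part of the original file] Minimal usage of
 * the qsort() above with a caller-supplied comparison that returns
 * negative/zero/positive; the record type and helpers are hypothetical.
 */
#if 0
struct demo_rec {
	int	key;
};

static int
demo_cmp(const void *va, const void *vb)
{
	const struct demo_rec *a = va, *b = vb;

	return a->key - b->key;		/* ascending by key */
}

static void
demo_sort(struct demo_rec *recs, size_t nrecs)
{
	qsort(recs, nrecs, sizeof(recs[0]), demo_cmp);
}
#endif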
diff --git a/fs/xfs/support/qsort.h b/fs/xfs/support/qsort.h
new file mode 100644
index 000000000000..94263106d716
--- /dev/null
+++ b/fs/xfs/support/qsort.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#ifndef QSORT_H
34#define QSORT_H
35
36extern void qsort (void *const pbase,
37 size_t total_elems,
38 size_t size,
39 int (*cmp)(const void *, const void *));
40
41#endif
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
new file mode 100644
index 000000000000..81f40cfcb267
--- /dev/null
+++ b/fs/xfs/support/uuid.c
@@ -0,0 +1,151 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <xfs.h>
34
35static mutex_t uuid_monitor;
36static int uuid_table_size;
37static uuid_t *uuid_table;
38
39void
40uuid_init(void)
41{
42 mutex_init(&uuid_monitor, MUTEX_DEFAULT, "uuid_monitor");
43}
44
45/*
46 * uuid_getnodeuniq - obtain the node unique fields of a UUID.
47 *
48 * This is not in any way a standard or condoned UUID function;
49 * it is just something that's needed for user-level file handles.
50 */
51void
52uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
53{
54 char *uu = (char *)uuid;
55
56 /* on IRIX, this function assumes big-endian fields within
57 * the uuid, so we use INT_GET to get the same result on
58 * little-endian systems
59 */
60
61 fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) +
62 INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT);
63 fsid[1] = INT_GET(*(u_int32_t*)(uu ), ARCH_CONVERT);
64}
65
66void
67uuid_create_nil(uuid_t *uuid)
68{
69 memset(uuid, 0, sizeof(*uuid));
70}
71
72int
73uuid_is_nil(uuid_t *uuid)
74{
75 int i;
76 char *cp = (char *)uuid;
77
78 if (uuid == NULL)
79 return 0;
80 /* implied check of version number here... */
81 for (i = 0; i < sizeof *uuid; i++)
82 if (*cp++) return 0; /* not nil */
83 return 1; /* is nil */
84}
85
86int
87uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
88{
89 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
90}
91
92/*
93 * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
94 * 64-bit words. NOTE: This function cannot be changed EVER. Although
95 * brain-dead, some applications depend on this 64-bit value remaining
96 * persistent. Specifically, DMI vendors store the value as a persistent
97 * filehandle.
98 */
99__uint64_t
100uuid_hash64(uuid_t *uuid)
101{
102 __uint64_t *sp = (__uint64_t *)uuid;
103
104 return sp[0] + sp[1];
105}
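/*
 * [Editor's note] A worked example of the sum above, with hypothetical
 * uuid contents read as two host-endian 64-bit words:
 *
 *	sp[0] = 0x0123456789abcdef
 *	sp[1] = 0xfedcba9876543210
 *	uuid_hash64() = sp[0] + sp[1] = 0xffffffffffffffff
 *
 * The result depends on host endianness, since the uuid bytes are
 * reinterpreted in place.
 */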
106
107int
108uuid_table_insert(uuid_t *uuid)
109{
110 int i, hole;
111
112 mutex_lock(&uuid_monitor, PVFS);
113 for (i = 0, hole = -1; i < uuid_table_size; i++) {
114 if (uuid_is_nil(&uuid_table[i])) {
115 hole = i;
116 continue;
117 }
118 if (uuid_equal(uuid, &uuid_table[i])) {
119 mutex_unlock(&uuid_monitor);
120 return 0;
121 }
122 }
123 if (hole < 0) {
124 uuid_table = kmem_realloc(uuid_table,
125 (uuid_table_size + 1) * sizeof(*uuid_table),
126 uuid_table_size * sizeof(*uuid_table),
127 KM_SLEEP);
128 hole = uuid_table_size++;
129 }
130 uuid_table[hole] = *uuid;
131 mutex_unlock(&uuid_monitor);
132 return 1;
133}
134
135void
136uuid_table_remove(uuid_t *uuid)
137{
138 int i;
139
140 mutex_lock(&uuid_monitor, PVFS);
141 for (i = 0; i < uuid_table_size; i++) {
142 if (uuid_is_nil(&uuid_table[i]))
143 continue;
144 if (!uuid_equal(uuid, &uuid_table[i]))
145 continue;
146 uuid_create_nil(&uuid_table[i]);
147 break;
148 }
149 ASSERT(i < uuid_table_size);
150 mutex_unlock(&uuid_monitor);
151}
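/*
 * [Editor's sketch, not part of the original file] Typical use of the
 * table above to reject a duplicate filesystem uuid; the caller and
 * its error handling are hypothetical.
 */
#if 0
static int
example_register_fs_uuid(uuid_t *uuid)
{
	if (uuid_is_nil(uuid))
		return 0;		/* nothing to track */
	if (!uuid_table_insert(uuid))
		return EINVAL;		/* duplicate: already mounted */
	/* ... and on unmount: uuid_table_remove(uuid); */
	return 0;
}
#endif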
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
new file mode 100644
index 000000000000..5220ea58ba2b
--- /dev/null
+++ b/fs/xfs/support/uuid.h
@@ -0,0 +1,48 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPPORT_UUID_H__
33#define __XFS_SUPPORT_UUID_H__
34
35typedef struct {
36 unsigned char __u_bits[16];
37} uuid_t;
38
39void uuid_init(void);
40void uuid_create_nil(uuid_t *uuid);
41int uuid_is_nil(uuid_t *uuid);
42int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
43void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
44__uint64_t uuid_hash64(uuid_t *uuid);
45int uuid_table_insert(uuid_t *uuid);
46void uuid_table_remove(uuid_t *uuid);
47
48#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
new file mode 100644
index 000000000000..7e276dcaf4dc
--- /dev/null
+++ b/fs/xfs/xfs.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_H__
33#define __XFS_H__
34
35#include <linux-2.6/xfs_linux.h>
36
37#include <xfs_fs.h>
38#include <xfs_macros.h>
39
40#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
new file mode 100644
index 000000000000..8d01dce8c532
--- /dev/null
+++ b/fs/xfs/xfs_acl.c
@@ -0,0 +1,937 @@
1/*
2 * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_inum.h"
36#include "xfs_dir.h"
37#include "xfs_dir2.h"
38#include "xfs_alloc_btree.h"
39#include "xfs_bmap_btree.h"
40#include "xfs_ialloc_btree.h"
41#include "xfs_btree.h"
42#include "xfs_attr_sf.h"
43#include "xfs_dir_sf.h"
44#include "xfs_dir2_sf.h"
45#include "xfs_dinode.h"
46#include "xfs_inode.h"
47#include "xfs_acl.h"
48#include "xfs_mac.h"
49#include "xfs_attr.h"
50
51#include <linux/posix_acl_xattr.h>
52
53STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
54STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
55STATIC void xfs_acl_get_endian(xfs_acl_t *);
56STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
57STATIC int xfs_acl_invalid(xfs_acl_t *);
58STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
59STATIC void xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
60STATIC void xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
61STATIC int xfs_acl_allow_set(vnode_t *, int);
62
63kmem_zone_t *xfs_acl_zone;
64
65
66/*
67 * Test for existence of access ACL attribute as efficiently as possible.
68 */
69int
70xfs_acl_vhasacl_access(
71 vnode_t *vp)
72{
73 int error;
74
75 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
76 return (error == 0);
77}
78
79/*
80 * Test for existence of default ACL attribute as efficiently as possible.
81 */
82int
83xfs_acl_vhasacl_default(
84 vnode_t *vp)
85{
86 int error;
87
88 if (vp->v_type != VDIR)
89 return 0;
90 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
91 return (error == 0);
92}
93
94/*
95 * Convert from extended attribute representation to in-memory for XFS.
96 */
97STATIC int
98posix_acl_xattr_to_xfs(
99 posix_acl_xattr_header *src,
100 size_t size,
101 xfs_acl_t *dest)
102{
103 posix_acl_xattr_entry *src_entry;
104 xfs_acl_entry_t *dest_entry;
105 int n;
106
107 if (!src || !dest)
108 return EINVAL;
109
110 if (size < sizeof(posix_acl_xattr_header))
111 return EINVAL;
112
113 if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
114 return EOPNOTSUPP;
115
116 memset(dest, 0, sizeof(xfs_acl_t));
117 dest->acl_cnt = posix_acl_xattr_count(size);
118 if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
119 return EINVAL;
120
121 /*
122 * acl_set_file(3) may request that we set default ACLs with
123 * zero length -- defend (gracefully) against that here.
124 */
125 if (!dest->acl_cnt)
126 return 0;
127
128 src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
129 dest_entry = &dest->acl_entry[0];
130
131 for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
132 dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
133 if (_ACL_PERM_INVALID(dest_entry->ae_perm))
134 return EINVAL;
135 dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag);
136 switch(dest_entry->ae_tag) {
137 case ACL_USER:
138 case ACL_GROUP:
139 dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
140 break;
141 case ACL_USER_OBJ:
142 case ACL_GROUP_OBJ:
143 case ACL_MASK:
144 case ACL_OTHER:
145 dest_entry->ae_id = ACL_UNDEFINED_ID;
146 break;
147 default:
148 return EINVAL;
149 }
150 }
151 if (xfs_acl_invalid(dest))
152 return EINVAL;
153
154 return 0;
155}
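/*
 * [Editor's note] The source buffer decoded above is the standard
 * little-endian POSIX ACL xattr layout.  For example, an ACL of
 * "u::rw-,g::r--,o::r--" arrives roughly as (values representative,
 * not taken from the original source):
 *
 *	header:  a_version = POSIX_ACL_XATTR_VERSION	(le32)
 *	entry 0: e_tag = ACL_USER_OBJ,  e_perm = 6 (rw-), e_id unused
 *	entry 1: e_tag = ACL_GROUP_OBJ, e_perm = 4 (r--), e_id unused
 *	entry 2: e_tag = ACL_OTHER,     e_perm = 4 (r--), e_id unused
 */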
156
157/*
158 * Comparison function called from qsort().
159 * Primary key is ae_tag, secondary key is ae_id.
160 */
161STATIC int
162xfs_acl_entry_compare(
163 const void *va,
164 const void *vb)
165{
166 xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
167 *b = (xfs_acl_entry_t *)vb;
168
169 if (a->ae_tag == b->ae_tag)
170 return (a->ae_id - b->ae_id);
171 return (a->ae_tag - b->ae_tag);
172}
173
174/*
175 * Convert from in-memory XFS to extended attribute representation.
176 */
177STATIC int
178posix_acl_xfs_to_xattr(
179 xfs_acl_t *src,
180 posix_acl_xattr_header *dest,
181 size_t size)
182{
183 int n;
184 size_t new_size = posix_acl_xattr_size(src->acl_cnt);
185 posix_acl_xattr_entry *dest_entry;
186 xfs_acl_entry_t *src_entry;
187
188 if (size < new_size)
189 return -ERANGE;
190
191 /* Need to sort src XFS ACL by <ae_tag,ae_id> */
192 qsort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
193 xfs_acl_entry_compare);
194
195 dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
196 dest_entry = &dest->a_entries[0];
197 src_entry = &src->acl_entry[0];
198 for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
199 dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
200 if (_ACL_PERM_INVALID(src_entry->ae_perm))
201 return -EINVAL;
202 dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag);
203 switch (src_entry->ae_tag) {
204 case ACL_USER:
205 case ACL_GROUP:
206 dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
207 break;
208 case ACL_USER_OBJ:
209 case ACL_GROUP_OBJ:
210 case ACL_MASK:
211 case ACL_OTHER:
212 dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
213 break;
214 default:
215 return -EINVAL;
216 }
217 }
218 return new_size;
219}
220
221int
222xfs_acl_vget(
223 vnode_t *vp,
224 void *acl,
225 size_t size,
226 int kind)
227{
228 int error;
229 xfs_acl_t *xfs_acl = NULL;
230 posix_acl_xattr_header *ext_acl = acl;
231 int flags = 0;
232
233 VN_HOLD(vp);
234 if(size) {
235 if (!(_ACL_ALLOC(xfs_acl))) {
236 error = ENOMEM;
237 goto out;
238 }
239 memset(xfs_acl, 0, sizeof(xfs_acl_t));
240 } else
241 flags = ATTR_KERNOVAL;
242
243 xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
244 if (error)
245 goto out;
246
247 if (!size) {
248 error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
249 } else {
250 if (xfs_acl_invalid(xfs_acl)) {
251 error = EINVAL;
252 goto out;
253 }
254 if (kind == _ACL_TYPE_ACCESS) {
255 vattr_t va;
256
257 va.va_mask = XFS_AT_MODE;
258 VOP_GETATTR(vp, &va, 0, sys_cred, error);
259 if (error)
260 goto out;
261 xfs_acl_sync_mode(va.va_mode, xfs_acl);
262 }
263 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
264 }
265out:
266 VN_RELE(vp);
267 if(xfs_acl)
268 _ACL_FREE(xfs_acl);
269 return -error;
270}
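/*
 * [Editor's note] As with getxattr(2), passing size == 0 above acts
 * as a size probe: the function returns the worst-case buffer size
 * for XFS_ACL_MAX_ENTRIES entries (posix_acl_xattr_size() of that
 * count), and the caller retries with a buffer at least that large.
 */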
271
272int
273xfs_acl_vremove(
274 vnode_t *vp,
275 int kind)
276{
277 int error;
278
279 VN_HOLD(vp);
280 error = xfs_acl_allow_set(vp, kind);
281 if (!error) {
282 VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
283 SGI_ACL_DEFAULT: SGI_ACL_FILE,
284 ATTR_ROOT, sys_cred, error);
285 if (error == ENOATTR)
286 error = 0; /* 'scool */
287 }
288 VN_RELE(vp);
289 return -error;
290}
291
292int
293xfs_acl_vset(
294 vnode_t *vp,
295 void *acl,
296 size_t size,
297 int kind)
298{
299 posix_acl_xattr_header *ext_acl = acl;
300 xfs_acl_t *xfs_acl;
301 int error;
302 int basicperms = 0; /* more than std unix perms? */
303
304 if (!acl)
305 return -EINVAL;
306
307 if (!(_ACL_ALLOC(xfs_acl)))
308 return -ENOMEM;
309
310 error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
311 if (error) {
312 _ACL_FREE(xfs_acl);
313 return -error;
314 }
315 if (!xfs_acl->acl_cnt) {
316 _ACL_FREE(xfs_acl);
317 return 0;
318 }
319
320 VN_HOLD(vp);
321 error = xfs_acl_allow_set(vp, kind);
322 if (error)
323 goto out;
324
325 /* Incoming ACL exists, set file mode based on its value */
326 if (kind == _ACL_TYPE_ACCESS)
327 xfs_acl_setmode(vp, xfs_acl, &basicperms);
328
329 /*
330 * If we have more than std unix permissions, set up the actual attr.
331 * Otherwise, delete any existing attr. This prevents us from
332 * having actual attrs for permissions that can be stored in the
333 * standard permission bits.
334 */
335 if (!basicperms) {
336 xfs_acl_set_attr(vp, xfs_acl, kind, &error);
337 } else {
338 xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
339 }
340
341out:
342 VN_RELE(vp);
343 _ACL_FREE(xfs_acl);
344 return -error;
345}
346
347int
348xfs_acl_iaccess(
349 xfs_inode_t *ip,
350 mode_t mode,
351 cred_t *cr)
352{
353 xfs_acl_t *acl;
354 int rval;
355
356 if (!(_ACL_ALLOC(acl)))
357 return -1;
358
359 /* If the file has no ACL return -1. */
360 rval = sizeof(xfs_acl_t);
361 if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE,
362 (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) {
363 _ACL_FREE(acl);
364 return -1;
365 }
366 xfs_acl_get_endian(acl);
367
368 /* If the file has an empty ACL return -1. */
369 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
370 _ACL_FREE(acl);
371 return -1;
372 }
373
374 /* Synchronize ACL with mode bits */
375 xfs_acl_sync_mode(ip->i_d.di_mode, acl);
376
377 rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
378 _ACL_FREE(acl);
379 return rval;
380}
381
382STATIC int
383xfs_acl_allow_set(
384 vnode_t *vp,
385 int kind)
386{
387 vattr_t va;
388 int error;
389
390 if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
391 return EPERM;
392 if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR)
393 return ENOTDIR;
394 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
395 return EROFS;
396 va.va_mask = XFS_AT_UID;
397 VOP_GETATTR(vp, &va, 0, NULL, error);
398 if (error)
399 return error;
400 if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
401 return EPERM;
402 return error;
403}
404
405/*
406 * The access control process to determine the access permission:
407 * if uid == file owner id, use the file owner bits.
408 * if gid == file owner group id, use the file group bits.
409 * otherwise, scan the ACL for a matching user or group entry and use
410 * its permission, considering every matching group entry until the
411 * acl entries are exhausted. The final permission produced by the
412 * matching entry or entries is then ANDed with the mask permission.
413 * if not owner, owning group, or matching entry in the ACL, use the
414 * file other bits.
415 */
416STATIC int
417xfs_acl_capability_check(
418 mode_t mode,
419 cred_t *cr)
420{
421 if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH))
422 return EACCES;
423 if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
424 return EACCES;
425 if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
426 return EACCES;
427
428 return 0;
429}
430
431/*
432 * Note: cr is only used here for the capability check if the ACL test fails.
433 * It is not used to find out the credential's uid or groups etc., as was
434 * done in IRIX. It is assumed that the uid and groups for the current
435 * thread are taken from "current" instead of the cr parameter.
436 */
437STATIC int
438xfs_acl_access(
439 uid_t fuid,
440 gid_t fgid,
441 xfs_acl_t *fap,
442 mode_t md,
443 cred_t *cr)
444{
445 xfs_acl_entry_t matched;
446 int i, allows;
447 int maskallows = -1; /* true, but not 1, either */
448 int seen_userobj = 0;
449
450 matched.ae_tag = 0; /* Invalid type */
451 md >>= 6; /* Normalize the bits for comparison */
452
453 for (i = 0; i < fap->acl_cnt; i++) {
454 /*
455 * Break out if we've got a user_obj entry or
456 * a user entry and the mask (and have processed USER_OBJ)
457 */
458 if (matched.ae_tag == ACL_USER_OBJ)
459 break;
460 if (matched.ae_tag == ACL_USER) {
461 if (maskallows != -1 && seen_userobj)
462 break;
463 if (fap->acl_entry[i].ae_tag != ACL_MASK &&
464 fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
465 continue;
466 }
467 /* True if this entry allows the requested access */
468 allows = ((fap->acl_entry[i].ae_perm & md) == md);
469
470 switch (fap->acl_entry[i].ae_tag) {
471 case ACL_USER_OBJ:
472 seen_userobj = 1;
473 if (fuid != current->fsuid)
474 continue;
475 matched.ae_tag = ACL_USER_OBJ;
476 matched.ae_perm = allows;
477 break;
478 case ACL_USER:
479 if (fap->acl_entry[i].ae_id != current->fsuid)
480 continue;
481 matched.ae_tag = ACL_USER;
482 matched.ae_perm = allows;
483 break;
484 case ACL_GROUP_OBJ:
485 if ((matched.ae_tag == ACL_GROUP_OBJ ||
486 matched.ae_tag == ACL_GROUP) && !allows)
487 continue;
488 if (!in_group_p(fgid))
489 continue;
490 matched.ae_tag = ACL_GROUP_OBJ;
491 matched.ae_perm = allows;
492 break;
493 case ACL_GROUP:
494 if ((matched.ae_tag == ACL_GROUP_OBJ ||
495 matched.ae_tag == ACL_GROUP) && !allows)
496 continue;
497 if (!in_group_p(fap->acl_entry[i].ae_id))
498 continue;
499 matched.ae_tag = ACL_GROUP;
500 matched.ae_perm = allows;
501 break;
502 case ACL_MASK:
503 maskallows = allows;
504 break;
505 case ACL_OTHER:
506 if (matched.ae_tag != 0)
507 continue;
508 matched.ae_tag = ACL_OTHER;
509 matched.ae_perm = allows;
510 break;
511 }
512 }
513 /*
514 * First possibility is that no matched entry allows access.
515 * The capability to override DAC may exist, so check for it.
516 */
517 switch (matched.ae_tag) {
518 case ACL_OTHER:
519 case ACL_USER_OBJ:
520 if (matched.ae_perm)
521 return 0;
522 break;
523 case ACL_USER:
524 case ACL_GROUP_OBJ:
525 case ACL_GROUP:
526 if (maskallows && matched.ae_perm)
527 return 0;
528 break;
529 case 0:
530 break;
531 }
532
533 return xfs_acl_capability_check(md, cr);
534}
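/*
 * [Editor's note] A worked example of the matching loop above, with
 * hypothetical identities.  File ACL (entries shown in tag order):
 *
 *	u::rw-  u:500:r--  g::rw-  m::r--  o::---
 *
 * A process with fsuid 500 (not the file owner) requesting read:
 * ACL_USER_OBJ is seen but does not match; the ACL_USER entry for
 * uid 500 matches and allows read; the scan continues until ACL_MASK
 * sets maskallows; then (maskallows && matched.ae_perm) grants access.
 * The same process requesting write fails the ACL_USER entry's
 * permission check, so the only remaining avenue is
 * xfs_acl_capability_check().
 */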
535
536/*
537 * ACL validity checker.
538 * This acl validation routine checks that each ACL entry read in makes sense.
539 */
540STATIC int
541xfs_acl_invalid(
542 xfs_acl_t *aclp)
543{
544 xfs_acl_entry_t *entry, *e;
545 int user = 0, group = 0, other = 0, mask = 0;
546 int mask_required = 0;
547 int i, j;
548
549 if (!aclp)
550 goto acl_invalid;
551
552 if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
553 goto acl_invalid;
554
555 for (i = 0; i < aclp->acl_cnt; i++) {
556 entry = &aclp->acl_entry[i];
557 switch (entry->ae_tag) {
558 case ACL_USER_OBJ:
559 if (user++)
560 goto acl_invalid;
561 break;
562 case ACL_GROUP_OBJ:
563 if (group++)
564 goto acl_invalid;
565 break;
566 case ACL_OTHER:
567 if (other++)
568 goto acl_invalid;
569 break;
570 case ACL_USER:
571 case ACL_GROUP:
572 for (j = i + 1; j < aclp->acl_cnt; j++) {
573 e = &aclp->acl_entry[j];
574 if (e->ae_id == entry->ae_id &&
575 e->ae_tag == entry->ae_tag)
576 goto acl_invalid;
577 }
578 mask_required++;
579 break;
580 case ACL_MASK:
581 if (mask++)
582 goto acl_invalid;
583 break;
584 default:
585 goto acl_invalid;
586 }
587 }
588 if (!user || !group || !other || (mask_required && !mask))
589 goto acl_invalid;
590 else
591 return 0;
592acl_invalid:
593 return EINVAL;
594}
595
596/*
597 * Do ACL endian conversion.
598 */
599STATIC void
600xfs_acl_get_endian(
601 xfs_acl_t *aclp)
602{
603 xfs_acl_entry_t *ace, *end;
604
605 INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
606 end = &aclp->acl_entry[0]+aclp->acl_cnt;
607 for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
608 INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
609 INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
610 INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
611 }
612}
613
614/*
615 * Get the ACL from the EA and do endian conversion.
616 */
617STATIC void
618xfs_acl_get_attr(
619 vnode_t *vp,
620 xfs_acl_t *aclp,
621 int kind,
622 int flags,
623 int *error)
624{
625 int len = sizeof(xfs_acl_t);
626
627 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
628 flags |= ATTR_ROOT;
629 VOP_ATTR_GET(vp,
630 kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
631 (char *)aclp, &len, flags, sys_cred, *error);
632 if (*error || (flags & ATTR_KERNOVAL))
633 return;
634 xfs_acl_get_endian(aclp);
635}
636
637/*
638 * Set the EA with the ACL and do endian conversion.
639 */
640STATIC void
641xfs_acl_set_attr(
642 vnode_t *vp,
643 xfs_acl_t *aclp,
644 int kind,
645 int *error)
646{
647 xfs_acl_entry_t *ace, *newace, *end;
648 xfs_acl_t *newacl;
649 int len;
650
651 if (!(_ACL_ALLOC(newacl))) {
652 *error = ENOMEM;
653 return;
654 }
655
656 len = sizeof(xfs_acl_t) -
657 (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
658 end = &aclp->acl_entry[0]+aclp->acl_cnt;
659 for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
660 ace < end;
661 ace++, newace++) {
662 INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
663 INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
664 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
665 }
666 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
667 VOP_ATTR_SET(vp,
668 kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
669 (char *)newacl, len, ATTR_ROOT, sys_cred, *error);
670 _ACL_FREE(newacl);
671}
672
673int
674xfs_acl_vtoacl(
675 vnode_t *vp,
676 xfs_acl_t *access_acl,
677 xfs_acl_t *default_acl)
678{
679 vattr_t va;
680 int error = 0;
681
682 if (access_acl) {
683 /*
684 * Get the Access ACL and the mode. If either cannot
685 * be obtained for some reason, invalidate the access ACL.
686 */
687 xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
688 if (!error) {
689 /* Got the ACL, need the mode... */
690 va.va_mask = XFS_AT_MODE;
691 VOP_GETATTR(vp, &va, 0, sys_cred, error);
692 }
693
694 if (error)
695 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
696 else /* We have a good ACL and the file mode, synchronize. */
697 xfs_acl_sync_mode(va.va_mode, access_acl);
698 }
699
700 if (default_acl) {
701 xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
702 if (error)
703 default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
704 }
705 return error;
706}
707
708/*
709 * This function retrieves the parent directory's acl, processes it
710 * and lets the child inherit the acl(s) that it should.
711 */
712int
713xfs_acl_inherit(
714 vnode_t *vp,
715 vattr_t *vap,
716 xfs_acl_t *pdaclp)
717{
718 xfs_acl_t *cacl;
719 int error = 0;
720 int basicperms = 0;
721
722 /*
723 * If the parent does not have a default ACL, or it's an
724 * invalid ACL, we're done.
725 */
726 if (!vp)
727 return 0;
728 if (!pdaclp || xfs_acl_invalid(pdaclp))
729 return 0;
730
731 /*
732 * Copy the default ACL of the containing directory to
733 * the access ACL of the new file and use the mode that
734 * was passed in to set up the correct initial values for
735 * the u::,g::[m::], and o:: entries. This is what makes
736 * umask() "work" with ACLs.
737 */
738
739 if (!(_ACL_ALLOC(cacl)))
740 return ENOMEM;
741
742 memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
743 xfs_acl_filter_mode(vap->va_mode, cacl);
744 xfs_acl_setmode(vp, cacl, &basicperms);
745
746 /*
747 * Set the Default and Access ACL on the file. The mode is already
748 * set on the file, so we don't need to worry about that.
749 *
750 * If the new file is a directory, its default ACL is a copy of
751 * the containing directory's default ACL.
752 */
753 if (vp->v_type == VDIR)
754 xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
755 if (!error && !basicperms)
756 xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
757 _ACL_FREE(cacl);
758 return error;
759}
760
761/*
762 * Set up the correct mode on the file based on the supplied ACL. This
763 * makes sure that the mode on the file reflects the state of the
764 * u::,g::[m::], and o:: entries in the ACL. Since the mode is where
765 * the ACL is going to get the permissions for these entries, we must
766 * synchronize the mode whenever we set the ACL on a file.
767 */
768STATIC int
769xfs_acl_setmode(
770 vnode_t *vp,
771 xfs_acl_t *acl,
772 int *basicperms)
773{
774 vattr_t va;
775 xfs_acl_entry_t *ap;
776 xfs_acl_entry_t *gap = NULL;
777 int i, error, nomask = 1;
778
779 *basicperms = 1;
780
781 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
782 return 0;
783
784 /*
785 * Copy the u::, g::, o::, and m:: bits from the ACL into the
786 * mode. The m:: bits take precedence over the g:: bits.
787 */
788 va.va_mask = XFS_AT_MODE;
789 VOP_GETATTR(vp, &va, 0, sys_cred, error);
790 if (error)
791 return error;
792
793 va.va_mask = XFS_AT_MODE;
794 va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
795 ap = acl->acl_entry;
796 for (i = 0; i < acl->acl_cnt; ++i) {
797 switch (ap->ae_tag) {
798 case ACL_USER_OBJ:
799 va.va_mode |= ap->ae_perm << 6;
800 break;
801 case ACL_GROUP_OBJ:
802 gap = ap;
803 break;
804 case ACL_MASK: /* more than just standard modes */
805 nomask = 0;
806 va.va_mode |= ap->ae_perm << 3;
807 *basicperms = 0;
808 break;
809 case ACL_OTHER:
810 va.va_mode |= ap->ae_perm;
811 break;
812 default: /* more than just standard modes */
813 *basicperms = 0;
814 break;
815 }
816 ap++;
817 }
818
819 /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
820 if (gap && nomask)
821 va.va_mode |= gap->ae_perm << 3;
822
823 VOP_SETATTR(vp, &va, 0, sys_cred, error);
824 return error;
825}
826
827/*
828 * The permissions for the special ACL entries (u::, g::[m::], o::) are
829 * actually stored in the file mode (if there is both a group and a mask,
830 * the group is stored in the ACL entry and the mask is stored on the file).
831 * This allows the mode to remain automatically in sync with the ACL without
832 * the need for a call-back to the ACL system at every point where the mode
833 * could change. This function takes the permissions from the specified mode
834 * and places it in the supplied ACL.
835 *
836 * This implementation draws its validity from the fact that, when the ACL
837 * was assigned, the mode was copied from the ACL.
838 * If the mode did not change, therefore, the mode remains exactly what was
839 * taken from the special ACL entries at assignment.
840 * If a subsequent chmod() was done, the POSIX spec says that the change in
841 * mode must cause an update to the ACL seen at user level and used for
842 * access checks. Before and after a mode change, therefore, the file mode
843 * most accurately reflects what the special ACL entries should permit/deny.
844 *
845 * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
846 * the existing mode bits will override whatever is in the
847 * ACL. Similarly, if there is a pre-existing ACL that was
848 * never in sync with its mode (owing to a bug in 6.5 and
849 * before), it will now magically (or mystically) be
850 * synchronized. This could cause slight astonishment, but
851 * it is better than inconsistent permissions.
852 *
853 * The supplied ACL is a template that may contain any combination
854 * of special entries. These are treated as place holders when we fill
855 * out the ACL. This routine does not add or remove special entries, it
856 * simply unites each special entry with its associated set of permissions.
857 */
858STATIC void
859xfs_acl_sync_mode(
860 mode_t mode,
861 xfs_acl_t *acl)
862{
863 int i, nomask = 1;
864 xfs_acl_entry_t *ap;
865 xfs_acl_entry_t *gap = NULL;
866
867 /*
868 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
869 * be set instead of the GROUP entry, if there is a MASK.
870 */
871 for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
872 switch (ap->ae_tag) {
873 case ACL_USER_OBJ:
874 ap->ae_perm = (mode >> 6) & 0x7;
875 break;
876 case ACL_GROUP_OBJ:
877 gap = ap;
878 break;
879 case ACL_MASK:
880 nomask = 0;
881 ap->ae_perm = (mode >> 3) & 0x7;
882 break;
883 case ACL_OTHER:
884 ap->ae_perm = mode & 0x7;
885 break;
886 default:
887 break;
888 }
889 }
890 /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
891 if (gap && nomask)
892 gap->ae_perm = (mode >> 3) & 0x7;
893}
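/*
 * [Editor's note] For example, syncing mode 0640 into an ACL that
 * holds u::, g::, m:: and o:: entries gives:
 *
 *	u:: ae_perm = (0640 >> 6) & 7 = 6	(rw-)
 *	m:: ae_perm = (0640 >> 3) & 7 = 4	(r--)
 *	o:: ae_perm =  0640       & 7 = 0	(---)
 *
 * and g:: is left untouched because a mask is present (nomask == 0).
 */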
894
895/*
896 * When inheriting an Access ACL from a directory Default ACL,
897 * the ACL bits are set to the intersection of the ACL default
898 * permission bits and the file permission bits in mode. Permissions
899 * absent from the file mode must not be granted by the inherited
900 * ACL either. This is what makes umask() work with ACLs.
901 */
902STATIC void
903xfs_acl_filter_mode(
904 mode_t mode,
905 xfs_acl_t *acl)
906{
907 int i, nomask = 1;
908 xfs_acl_entry_t *ap;
909 xfs_acl_entry_t *gap = NULL;
910
911 /*
912 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
913 * be merged with GROUP entry, if there is a MASK.
914 */
915 for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
916 switch (ap->ae_tag) {
917 case ACL_USER_OBJ:
918 ap->ae_perm &= (mode >> 6) & 0x7;
919 break;
920 case ACL_GROUP_OBJ:
921 gap = ap;
922 break;
923 case ACL_MASK:
924 nomask = 0;
925 ap->ae_perm &= (mode >> 3) & 0x7;
926 break;
927 case ACL_OTHER:
928 ap->ae_perm &= mode & 0x7;
929 break;
930 default:
931 break;
932 }
933 }
934 /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
935 if (gap && nomask)
936 gap->ae_perm &= (mode >> 3) & 0x7;
937}
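/*
 * [Editor's note] For example, inheriting a default ACL of
 * u::rwx,g::rwx,o::rwx (no mask) into a file created with mode 0644
 * intersects down to:
 *
 *	u:: 7 & 6 = 6	(rw-)
 *	g:: 7 & 4 = 4	(r--, applied via the ACL_GROUP_OBJ entry,
 *			 since there is no mask)
 *	o:: 7 & 4 = 4	(r--)
 */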
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
new file mode 100644
index 000000000000..0363eb46d357
--- /dev/null
+++ b/fs/xfs/xfs_acl.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ACL_H__
33#define __XFS_ACL_H__
34
35/*
36 * Access Control Lists
37 */
38typedef __uint16_t xfs_acl_perm_t;
39typedef __int32_t xfs_acl_type_t;
40typedef __int32_t xfs_acl_tag_t;
41typedef __int32_t xfs_acl_id_t;
42
43#define XFS_ACL_MAX_ENTRIES 25
44#define XFS_ACL_NOT_PRESENT (-1)
45
46typedef struct xfs_acl_entry {
47 xfs_acl_tag_t ae_tag;
48 xfs_acl_id_t ae_id;
49 xfs_acl_perm_t ae_perm;
50} xfs_acl_entry_t;
51
52typedef struct xfs_acl {
53 __int32_t acl_cnt;
54 xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES];
55} xfs_acl_t;
56
57/* On-disk XFS extended attribute names */
58#define SGI_ACL_FILE "SGI_ACL_FILE"
59#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
60#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
61#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
62
63
64#ifdef CONFIG_XFS_POSIX_ACL
65
66struct vattr;
67struct vnode;
68struct xfs_inode;
69
70extern struct kmem_zone *xfs_acl_zone;
71#define xfs_acl_zone_init(zone, name) \
72 (zone) = kmem_zone_init(sizeof(xfs_acl_t), name)
73#define xfs_acl_zone_destroy(zone) kmem_cache_destroy(zone)
74
75extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
76extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
77extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
78extern int xfs_acl_vhasacl_access(struct vnode *);
79extern int xfs_acl_vhasacl_default(struct vnode *);
80extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
81extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
82extern int xfs_acl_vremove(struct vnode *vp, int);
83
84#define _ACL_TYPE_ACCESS 1
85#define _ACL_TYPE_DEFAULT 2
86#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
87
88#define _ACL_INHERIT(c,v,d) (xfs_acl_inherit(c,v,d))
89#define _ACL_GET_ACCESS(pv,pa) (xfs_acl_vtoacl(pv,pa,NULL) == 0)
90#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0)
91#define _ACL_ACCESS_EXISTS xfs_acl_vhasacl_access
92#define _ACL_DEFAULT_EXISTS xfs_acl_vhasacl_default
93#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1)
94
95#define _ACL_ALLOC(a) ((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
96#define _ACL_FREE(a) ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
97
98#else
99#define xfs_acl_zone_init(zone,name)
100#define xfs_acl_zone_destroy(zone)
101#define xfs_acl_vset(v,p,sz,t) (-EOPNOTSUPP)
102#define xfs_acl_vget(v,p,sz,t) (-EOPNOTSUPP)
103#define xfs_acl_vremove(v,t) (-EOPNOTSUPP)
104#define xfs_acl_vhasacl_access(v) (0)
105#define xfs_acl_vhasacl_default(v) (0)
106#define _ACL_ALLOC(a) (1) /* successfully allocate nothing */
107#define _ACL_FREE(a) ((void)0)
108#define _ACL_INHERIT(c,v,d) (0)
109#define _ACL_GET_ACCESS(pv,pa) (0)
110#define _ACL_GET_DEFAULT(pv,pd) (0)
111#define _ACL_ACCESS_EXISTS (NULL)
112#define _ACL_DEFAULT_EXISTS (NULL)
113#define _ACL_XFS_IACCESS(i,m,c) (-1)
114#endif
115
116#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
new file mode 100644
index 000000000000..96b70f7fba39
--- /dev/null
+++ b/fs/xfs/xfs_ag.h
@@ -0,0 +1,345 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_AG_H__
33#define __XFS_AG_H__
34
35/*
36 * Allocation group header
37 * This is divided into three structures, placed in sequential 512-byte
38 * buffers after a copy of the superblock (also in a 512-byte buffer).
39 */
40
41struct xfs_buf;
42struct xfs_mount;
43struct xfs_trans;
44
45#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
46#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
47#define XFS_AGF_VERSION 1
48#define XFS_AGI_VERSION 1
49#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_GOOD_VERSION)
50int xfs_agf_good_version(unsigned v);
51#define XFS_AGF_GOOD_VERSION(v) xfs_agf_good_version(v)
52#else
53#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
54#endif
55#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_GOOD_VERSION)
56int xfs_agi_good_version(unsigned v);
57#define XFS_AGI_GOOD_VERSION(v) xfs_agi_good_version(v)
58#else
59#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
60#endif
61
62/*
63 * Btree number 0 is bno, 1 is cnt. This value gives the size of the
64 * arrays below.
65 */
66#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
67
68/*
69 * The second word of agf_levels in the first a.g. overlaps the EFS
70 * superblock's magic number. Since the magic numbers valid for EFS
71 * are > 64k, our value cannot be mistaken for an EFS superblock's.
72 */
73
74typedef struct xfs_agf
75{
76 /*
77 * Common allocation group header information
78 */
79 __uint32_t agf_magicnum; /* magic number == XFS_AGF_MAGIC */
80 __uint32_t agf_versionnum; /* header version == XFS_AGF_VERSION */
81 xfs_agnumber_t agf_seqno; /* sequence # starting from 0 */
82 xfs_agblock_t agf_length; /* size in blocks of a.g. */
83 /*
84 * Freespace information
85 */
86 xfs_agblock_t agf_roots[XFS_BTNUM_AGF]; /* root blocks */
87 __uint32_t agf_spare0; /* spare field */
88 __uint32_t agf_levels[XFS_BTNUM_AGF]; /* btree levels */
89 __uint32_t agf_spare1; /* spare field */
90 __uint32_t agf_flfirst; /* first freelist block's index */
91 __uint32_t agf_fllast; /* last freelist block's index */
92 __uint32_t agf_flcount; /* count of blocks in freelist */
93 xfs_extlen_t agf_freeblks; /* total free blocks */
94 xfs_extlen_t agf_longest; /* longest free space */
95} xfs_agf_t;
96
97#define XFS_AGF_MAGICNUM 0x00000001
98#define XFS_AGF_VERSIONNUM 0x00000002
99#define XFS_AGF_SEQNO 0x00000004
100#define XFS_AGF_LENGTH 0x00000008
101#define XFS_AGF_ROOTS 0x00000010
102#define XFS_AGF_LEVELS 0x00000020
103#define XFS_AGF_FLFIRST 0x00000040
104#define XFS_AGF_FLLAST 0x00000080
105#define XFS_AGF_FLCOUNT 0x00000100
106#define XFS_AGF_FREEBLKS 0x00000200
107#define XFS_AGF_LONGEST 0x00000400
108#define XFS_AGF_NUM_BITS 11
109#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
110
111/* disk block (xfs_daddr_t) in the AG */
112#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
113#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_BLOCK)
114xfs_agblock_t xfs_agf_block(struct xfs_mount *mp);
115#define XFS_AGF_BLOCK(mp) xfs_agf_block(mp)
116#else
117#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
118#endif
119
120/*
121 * Size of the unlinked inode hash table in the agi.
122 */
123#define XFS_AGI_UNLINKED_BUCKETS 64
124
125typedef struct xfs_agi
126{
127 /*
128 * Common allocation group header information
129 */
130 __uint32_t agi_magicnum; /* magic number == XFS_AGI_MAGIC */
131 __uint32_t agi_versionnum; /* header version == XFS_AGI_VERSION */
132 xfs_agnumber_t agi_seqno; /* sequence # starting from 0 */
133 xfs_agblock_t agi_length; /* size in blocks of a.g. */
134 /*
135 * Inode information
136 * Inodes are mapped by interpreting the inode number, so no
137 * mapping data is needed here.
138 */
139 xfs_agino_t agi_count; /* count of allocated inodes */
140 xfs_agblock_t agi_root; /* root of inode btree */
141 __uint32_t agi_level; /* levels in inode btree */
142 xfs_agino_t agi_freecount; /* number of free inodes */
143 xfs_agino_t agi_newino; /* new inode just allocated */
144 xfs_agino_t agi_dirino; /* last directory inode chunk */
145 /*
146 * Hash table of inodes which have been unlinked but are
147 * still being referenced.
148 */
149 xfs_agino_t agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
150} xfs_agi_t;
151
152#define XFS_AGI_MAGICNUM 0x00000001
153#define XFS_AGI_VERSIONNUM 0x00000002
154#define XFS_AGI_SEQNO 0x00000004
155#define XFS_AGI_LENGTH 0x00000008
156#define XFS_AGI_COUNT 0x00000010
157#define XFS_AGI_ROOT 0x00000020
158#define XFS_AGI_LEVEL 0x00000040
159#define XFS_AGI_FREECOUNT 0x00000080
160#define XFS_AGI_NEWINO 0x00000100
161#define XFS_AGI_DIRINO 0x00000200
162#define XFS_AGI_UNLINKED 0x00000400
163#define XFS_AGI_NUM_BITS 11
164#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1)
165
166/* disk block (xfs_daddr_t) in the AG */
167#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
168#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_BLOCK)
169xfs_agblock_t xfs_agi_block(struct xfs_mount *mp);
170#define XFS_AGI_BLOCK(mp) xfs_agi_block(mp)
171#else
172#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
173#endif
174
175/*
176 * The third a.g. block contains the a.g. freelist, an array
177 * of block pointers to blocks owned by the allocation btree code.
178 */
179#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
180#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGFL_BLOCK)
181xfs_agblock_t xfs_agfl_block(struct xfs_mount *mp);
182#define XFS_AGFL_BLOCK(mp) xfs_agfl_block(mp)
183#else
184#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
185#endif
186#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
187
188typedef struct xfs_agfl {
189 xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
190} xfs_agfl_t;
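/*
 * [Editor's note] With 512-byte sectors (m_sectbb_log == 0), the four
 * headers described above occupy consecutive disk blocks at the start
 * of each allocation group:
 *
 *	daddr 0: superblock (XFS_SB_DADDR)
 *	daddr 1: AGF  (XFS_AGF_DADDR)
 *	daddr 2: AGI  (XFS_AGI_DADDR)
 *	daddr 3: AGFL (XFS_AGFL_DADDR)
 */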
191
192/*
193 * Busy block/extent entry. Used in perag to mark blocks that have been freed
194 * but whose transactions aren't committed to disk yet.
195 */
196typedef struct xfs_perag_busy {
197 xfs_agblock_t busy_start;
198 xfs_extlen_t busy_length;
199 struct xfs_trans *busy_tp; /* transaction that did the free */
200} xfs_perag_busy_t;
201
202/*
203 * Per-ag incore structure, copies of information in agf and agi,
204 * to improve the performance of allocation group selection.
205 *
206 * pick sizes which fit in allocation buckets well
207 */
208#if (BITS_PER_LONG == 32)
209#define XFS_PAGB_NUM_SLOTS 84
210#elif (BITS_PER_LONG == 64)
211#define XFS_PAGB_NUM_SLOTS 128
212#endif
213
214typedef struct xfs_perag
215{
216 char pagf_init; /* this agf's entry is initialized */
217 char pagi_init; /* this agi's entry is initialized */
218 char pagf_metadata; /* the agf is preferred to be metadata */
219 char pagi_inodeok; /* The agi is ok for inodes */
220 __uint8_t pagf_levels[XFS_BTNUM_AGF];
221 /* # of levels in bno & cnt btree */
222 __uint32_t pagf_flcount; /* count of blocks in freelist */
223 xfs_extlen_t pagf_freeblks; /* total free blocks */
224 xfs_extlen_t pagf_longest; /* longest free space */
225 xfs_agino_t pagi_freecount; /* number of free inodes */
226#ifdef __KERNEL__
227 lock_t pagb_lock; /* lock for pagb_list */
228#endif
229 int pagb_count; /* pagb slots in use */
230 xfs_perag_busy_t *pagb_list; /* unstable blocks */
231} xfs_perag_t;
232
233#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAXLEVELS)
234int xfs_ag_maxlevels(struct xfs_mount *mp);
235#define XFS_AG_MAXLEVELS(mp) xfs_ag_maxlevels(mp)
236#else
237#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
238#endif
239#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST)
240int xfs_min_freelist(xfs_agf_t *a, struct xfs_mount *mp);
241#define XFS_MIN_FREELIST(a,mp) xfs_min_freelist(a,mp)
242#else
243#define XFS_MIN_FREELIST(a,mp) \
244 XFS_MIN_FREELIST_RAW( \
245 INT_GET((a)->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT), \
246 INT_GET((a)->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT), mp)
247#endif
248#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_PAG)
249int xfs_min_freelist_pag(xfs_perag_t *pag, struct xfs_mount *mp);
250#define XFS_MIN_FREELIST_PAG(pag,mp) xfs_min_freelist_pag(pag,mp)
251#else
252#define XFS_MIN_FREELIST_PAG(pag,mp) \
253 XFS_MIN_FREELIST_RAW((uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
254 (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)
255#endif
256#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_RAW)
257int xfs_min_freelist_raw(int bl, int cl, struct xfs_mount *mp);
258#define XFS_MIN_FREELIST_RAW(bl,cl,mp) xfs_min_freelist_raw(bl,cl,mp)
259#else
260#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
261 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + \
262 MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
263#endif
264
265#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_FSB)
266xfs_fsblock_t xfs_agb_to_fsb(struct xfs_mount *mp, xfs_agnumber_t agno,
267 xfs_agblock_t agbno);
268#define XFS_AGB_TO_FSB(mp,agno,agbno) xfs_agb_to_fsb(mp,agno,agbno)
269#else
270#define XFS_AGB_TO_FSB(mp,agno,agbno) \
271 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
272#endif
273#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGNO)
274xfs_agnumber_t xfs_fsb_to_agno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
275#define XFS_FSB_TO_AGNO(mp,fsbno) xfs_fsb_to_agno(mp,fsbno)
276#else
277#define XFS_FSB_TO_AGNO(mp,fsbno) \
278 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
279#endif
280#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGBNO)
281xfs_agblock_t xfs_fsb_to_agbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
282#define XFS_FSB_TO_AGBNO(mp,fsbno) xfs_fsb_to_agbno(mp,fsbno)
283#else
284#define XFS_FSB_TO_AGBNO(mp,fsbno) \
285 ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
286#endif
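/*
 * [Editor's note] A worked example of the packing above: with
 * sb_agblklog == 16 (a hypothetical value), a filesystem block number
 * is (agno << 16) | agbno, so:
 *
 *	XFS_AGB_TO_FSB(mp, 3, 0x0042)    == 0x00030042
 *	XFS_FSB_TO_AGNO(mp, 0x00030042)  == 3
 *	XFS_FSB_TO_AGBNO(mp, 0x00030042) == 0x0042
 */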
287
288#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_DADDR)
289xfs_daddr_t xfs_agb_to_daddr(struct xfs_mount *mp, xfs_agnumber_t agno,
290 xfs_agblock_t agbno);
291#define XFS_AGB_TO_DADDR(mp,agno,agbno) xfs_agb_to_daddr(mp,agno,agbno)
292#else
293#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
294 ((xfs_daddr_t)(XFS_FSB_TO_BB(mp, \
295 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))))
296#endif
297/*
298 * XFS_DADDR_TO_AGNO and XFS_DADDR_TO_AGBNO moved to xfs_mount.h
299 * to avoid header file ordering change
300 */
301
302#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_DADDR)
303xfs_daddr_t xfs_ag_daddr(struct xfs_mount *mp, xfs_agnumber_t agno,
304 xfs_daddr_t d);
305#define XFS_AG_DADDR(mp,agno,d) xfs_ag_daddr(mp,agno,d)
306#else
307#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
308#endif
309
310#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGF)
311xfs_agf_t *xfs_buf_to_agf(struct xfs_buf *bp);
312#define XFS_BUF_TO_AGF(bp) xfs_buf_to_agf(bp)
313#else
314#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
315#endif
316#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGI)
317xfs_agi_t *xfs_buf_to_agi(struct xfs_buf *bp);
318#define XFS_BUF_TO_AGI(bp) xfs_buf_to_agi(bp)
319#else
320#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
321#endif
322#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGFL)
323xfs_agfl_t *xfs_buf_to_agfl(struct xfs_buf *bp);
324#define XFS_BUF_TO_AGFL(bp) xfs_buf_to_agfl(bp)
325#else
326#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp))
327#endif
328
329/*
330 * For checking for bad ranges of xfs_daddr_t's, covering multiple
331 * allocation groups or a single xfs_daddr_t that's a superblock copy.
332 */
333#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_CHECK_DADDR)
334void xfs_ag_check_daddr(struct xfs_mount *mp, xfs_daddr_t d, xfs_extlen_t len);
335#define XFS_AG_CHECK_DADDR(mp,d,len) xfs_ag_check_daddr(mp,d,len)
336#else
337#define XFS_AG_CHECK_DADDR(mp,d,len) \
338 ((len) == 1 ? \
339 ASSERT((d) == XFS_SB_DADDR || \
340 XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
341 ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
342 XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
343#endif
344
345#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
new file mode 100644
index 000000000000..36603db10fe9
--- /dev/null
+++ b/fs/xfs/xfs_alloc.c
@@ -0,0 +1,2623 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Free space allocation for XFS.
35 */
36#include "xfs.h"
37#include "xfs_macros.h"
38#include "xfs_types.h"
39#include "xfs_inum.h"
40#include "xfs_log.h"
41#include "xfs_trans.h"
42#include "xfs_sb.h"
43#include "xfs_ag.h"
44#include "xfs_dir.h"
45#include "xfs_dmapi.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_alloc.h"
53#include "xfs_bit.h"
54#include "xfs_error.h"
55
56
57#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
58
59#define XFSA_FIXUP_BNO_OK 1
60#define XFSA_FIXUP_CNT_OK 2
61
62int
63xfs_alloc_search_busy(xfs_trans_t *tp,
64 xfs_agnumber_t agno,
65 xfs_agblock_t bno,
66 xfs_extlen_t len);
67
68#if defined(XFS_ALLOC_TRACE)
69ktrace_t *xfs_alloc_trace_buf;
70
71#define TRACE_ALLOC(s,a) \
72 xfs_alloc_trace_alloc(fname, s, a, __LINE__)
73#define TRACE_FREE(s,a,b,x,f) \
74 xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__)
75#define TRACE_MODAGF(s,a,f) \
76 xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__)
77#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \
78 xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
79#define TRACE_UNBUSY(fname,s,ag,sl,tp) \
80 xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
81#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \
82 xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
83#else
84#define TRACE_ALLOC(s,a)
85#define TRACE_FREE(s,a,b,x,f)
86#define TRACE_MODAGF(s,a,f)
87#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
88#define TRACE_UNBUSY(fname,s,ag,sl,tp)
89#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
90#endif /* XFS_ALLOC_TRACE */
91
92/*
93 * Prototypes for per-ag allocation routines
94 */
95
96STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
97STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
98STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
99STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
100 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
101
102/*
103 * Internal functions.
104 */
105
106/*
107 * Compute aligned version of the found extent.
108 * Takes alignment and min length into account.
109 */
110STATIC int /* success (>= minlen) */
111xfs_alloc_compute_aligned(
112 xfs_agblock_t foundbno, /* starting block in found extent */
113 xfs_extlen_t foundlen, /* length in found extent */
114 xfs_extlen_t alignment, /* alignment for allocation */
115 xfs_extlen_t minlen, /* minimum length for allocation */
116 xfs_agblock_t *resbno, /* result block number */
117 xfs_extlen_t *reslen) /* result length */
118{
119 xfs_agblock_t bno;
120 xfs_extlen_t diff;
121 xfs_extlen_t len;
122
123 if (alignment > 1 && foundlen >= minlen) {
124 bno = roundup(foundbno, alignment);
125 diff = bno - foundbno;
126 len = diff >= foundlen ? 0 : foundlen - diff;
127 } else {
128 bno = foundbno;
129 len = foundlen;
130 }
131 *resbno = bno;
132 *reslen = len;
133 return len >= minlen;
134}
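/*
 * Worked example (values invented for illustration): with
 * foundbno = 37, foundlen = 20, alignment = 8, minlen = 10:
 *	bno  = roundup(37, 8) = 40
 *	diff = 40 - 37        = 3
 *	len  = 20 - 3         = 17
 * so *resbno = 40, *reslen = 17, and the return value is 1 (17 >= 10).
 */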
135
136/*
137 * Compute best start block and diff for "near" allocations.
138 * freelen >= wantlen already checked by caller.
139 */
140STATIC xfs_extlen_t /* difference value (absolute) */
141xfs_alloc_compute_diff(
142 xfs_agblock_t wantbno, /* target starting block */
143 xfs_extlen_t wantlen, /* target length */
144 xfs_extlen_t alignment, /* target alignment */
145 xfs_agblock_t freebno, /* freespace's starting block */
146 xfs_extlen_t freelen, /* freespace's length */
147 xfs_agblock_t *newbnop) /* result: best start block from free */
148{
149 xfs_agblock_t freeend; /* end of freespace extent */
150 xfs_agblock_t newbno1; /* return block number */
151 xfs_agblock_t newbno2; /* other new block number */
152 xfs_extlen_t newlen1=0; /* length with newbno1 */
153 xfs_extlen_t newlen2=0; /* length with newbno2 */
154 xfs_agblock_t wantend; /* end of target extent */
155
156 ASSERT(freelen >= wantlen);
157 freeend = freebno + freelen;
158 wantend = wantbno + wantlen;
159 if (freebno >= wantbno) {
160 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
161 newbno1 = NULLAGBLOCK;
162 } else if (freeend >= wantend && alignment > 1) {
163 newbno1 = roundup(wantbno, alignment);
164 newbno2 = newbno1 - alignment;
165 if (newbno1 >= freeend)
166 newbno1 = NULLAGBLOCK;
167 else
168 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
169 if (newbno2 < freebno)
170 newbno2 = NULLAGBLOCK;
171 else
172 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
173 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
174 if (newlen1 < newlen2 ||
175 (newlen1 == newlen2 &&
176 XFS_ABSDIFF(newbno1, wantbno) >
177 XFS_ABSDIFF(newbno2, wantbno)))
178 newbno1 = newbno2;
179 } else if (newbno2 != NULLAGBLOCK)
180 newbno1 = newbno2;
181 } else if (freeend >= wantend) {
182 newbno1 = wantbno;
183 } else if (alignment > 1) {
184 newbno1 = roundup(freeend - wantlen, alignment);
185 if (newbno1 > freeend - wantlen &&
186 newbno1 - alignment >= freebno)
187 newbno1 -= alignment;
188 else if (newbno1 >= freeend)
189 newbno1 = NULLAGBLOCK;
190 } else
191 newbno1 = freeend - wantlen;
192 *newbnop = newbno1;
193 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
194}
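/*
 * Worked example (values invented for illustration): wantbno = 100,
 * wantlen = 16, alignment = 4, freebno = 90, freelen = 40, giving
 * freeend = 130 and wantend = 116.  Since freebno < wantbno,
 * freeend >= wantend and alignment > 1:
 *	newbno1 = roundup(100, 4) = 100, newlen1 = min(16, 30) = 16
 *	newbno2 = 100 - 4         = 96,  newlen2 = min(16, 34) = 16
 * The lengths tie and XFS_ABSDIFF(100, 100) <= XFS_ABSDIFF(96, 100),
 * so *newbnop = 100 and the returned diff is 0, a perfect fit.
 */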
195
196/*
197 * Fix up the length, based on mod and prod.
198 * len should be k * prod + mod for some k.
199 * If len is too small it is returned unchanged.
200 * If len hits maxlen it is left alone.
201 */
202STATIC void
203xfs_alloc_fix_len(
204 xfs_alloc_arg_t *args) /* allocation argument structure */
205{
206 xfs_extlen_t k;
207 xfs_extlen_t rlen;
208
209 ASSERT(args->mod < args->prod);
210 rlen = args->len;
211 ASSERT(rlen >= args->minlen);
212 ASSERT(rlen <= args->maxlen);
213 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
214 (args->mod == 0 && rlen < args->prod))
215 return;
216 k = rlen % args->prod;
217 if (k == args->mod)
218 return;
219 if (k > args->mod) {
220 if ((int)(rlen = rlen - k + args->mod) < (int)args->minlen)
221 return;
222 } else {
223 if ((int)(rlen = rlen - args->prod + (args->mod - k)) <
224 (int)args->minlen)
225 return;
226 }
227 ASSERT(rlen >= args->minlen);
228 ASSERT(rlen <= args->maxlen);
229 args->len = rlen;
230}
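/*
 * Worked example (values invented for illustration): prod = 4,
 * mod = 1, minlen = 5, maxlen = 20, len = 14.  Then k = 14 % 4 = 2,
 * and since k > mod, rlen becomes 14 - 2 + 1 = 13, which still
 * satisfies minlen and now has the required form: 13 = 3 * 4 + 1.
 */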
231
232/*
233 * Fix up length if there is too little space left in the a.g.
234 * Return 1 if ok, 0 if too little, should give up.
235 */
236STATIC int
237xfs_alloc_fix_minleft(
238 xfs_alloc_arg_t *args) /* allocation argument structure */
239{
240 xfs_agf_t *agf; /* a.g. freelist header */
241 int diff; /* free space difference */
242
243 if (args->minleft == 0)
244 return 1;
245 agf = XFS_BUF_TO_AGF(args->agbp);
246 diff = INT_GET(agf->agf_freeblks, ARCH_CONVERT)
247 + INT_GET(agf->agf_flcount, ARCH_CONVERT)
248 - args->len - args->minleft;
249 if (diff >= 0)
250 return 1;
251 args->len += diff; /* shrink the allocated space */
252 if (args->len >= args->minlen)
253 return 1;
254 args->agbno = NULLAGBLOCK;
255 return 0;
256}
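/*
 * Worked example (values invented for illustration): with
 * agf_freeblks = 50, agf_flcount = 6, len = 40, minleft = 20,
 * diff = 50 + 6 - 40 - 20 = -4, so len shrinks to 36.  If minlen
 * were larger than 36 the allocation would be abandoned instead,
 * with agbno = NULLAGBLOCK and a 0 return.
 */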
257
258/*
259 * Update the two btrees, logically removing from freespace the extent
260 * starting at rbno, rlen blocks. The extent is contained within the
261 * actual (current) free extent fbno for flen blocks.
262 * Flags are passed in indicating whether the cursors are set to the
263 * relevant records.
264 */
265STATIC int /* error code */
266xfs_alloc_fixup_trees(
267 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
268 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
269 xfs_agblock_t fbno, /* starting block of free extent */
270 xfs_extlen_t flen, /* length of free extent */
271 xfs_agblock_t rbno, /* starting block of returned extent */
272 xfs_extlen_t rlen, /* length of returned extent */
273 int flags) /* flags, XFSA_FIXUP_... */
274{
275 int error; /* error code */
276 int i; /* operation results */
277 xfs_agblock_t nfbno1; /* first new free startblock */
278 xfs_agblock_t nfbno2; /* second new free startblock */
279 xfs_extlen_t nflen1=0; /* first new free length */
280 xfs_extlen_t nflen2=0; /* second new free length */
281
282 /*
283 * Look up the record in the by-size tree if necessary.
284 */
285 if (flags & XFSA_FIXUP_CNT_OK) {
286#ifdef DEBUG
287 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
288 return error;
289 XFS_WANT_CORRUPTED_RETURN(
290 i == 1 && nfbno1 == fbno && nflen1 == flen);
291#endif
292 } else {
293 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
294 return error;
295 XFS_WANT_CORRUPTED_RETURN(i == 1);
296 }
297 /*
298 * Look up the record in the by-block tree if necessary.
299 */
300 if (flags & XFSA_FIXUP_BNO_OK) {
301#ifdef DEBUG
302 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
303 return error;
304 XFS_WANT_CORRUPTED_RETURN(
305 i == 1 && nfbno1 == fbno && nflen1 == flen);
306#endif
307 } else {
308 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
309 return error;
310 XFS_WANT_CORRUPTED_RETURN(i == 1);
311 }
312#ifdef DEBUG
313 {
314 xfs_alloc_block_t *bnoblock;
315 xfs_alloc_block_t *cntblock;
316
317 if (bno_cur->bc_nlevels == 1 &&
318 cnt_cur->bc_nlevels == 1) {
319 bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
320 cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
321 XFS_WANT_CORRUPTED_RETURN(
322 INT_GET(bnoblock->bb_numrecs, ARCH_CONVERT) == INT_GET(cntblock->bb_numrecs, ARCH_CONVERT));
323 }
324 }
325#endif
326 /*
327 * Deal with all four cases: the allocated record is contained
328 * within the freespace record, so we can have new freespace
329 * at either (or both) end, or no freespace remaining.
330 */
331 if (rbno == fbno && rlen == flen)
332 nfbno1 = nfbno2 = NULLAGBLOCK;
333 else if (rbno == fbno) {
334 nfbno1 = rbno + rlen;
335 nflen1 = flen - rlen;
336 nfbno2 = NULLAGBLOCK;
337 } else if (rbno + rlen == fbno + flen) {
338 nfbno1 = fbno;
339 nflen1 = flen - rlen;
340 nfbno2 = NULLAGBLOCK;
341 } else {
342 nfbno1 = fbno;
343 nflen1 = rbno - fbno;
344 nfbno2 = rbno + rlen;
345 nflen2 = (fbno + flen) - nfbno2;
346 }
347 /*
348 * Delete the entry from the by-size btree.
349 */
350 if ((error = xfs_alloc_delete(cnt_cur, &i)))
351 return error;
352 XFS_WANT_CORRUPTED_RETURN(i == 1);
353 /*
354 * Add new by-size btree entry(s).
355 */
356 if (nfbno1 != NULLAGBLOCK) {
357 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
358 return error;
359 XFS_WANT_CORRUPTED_RETURN(i == 0);
360 if ((error = xfs_alloc_insert(cnt_cur, &i)))
361 return error;
362 XFS_WANT_CORRUPTED_RETURN(i == 1);
363 }
364 if (nfbno2 != NULLAGBLOCK) {
365 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
366 return error;
367 XFS_WANT_CORRUPTED_RETURN(i == 0);
368 if ((error = xfs_alloc_insert(cnt_cur, &i)))
369 return error;
370 XFS_WANT_CORRUPTED_RETURN(i == 1);
371 }
372 /*
373 * Fix up the by-block btree entry(s).
374 */
375 if (nfbno1 == NULLAGBLOCK) {
376 /*
377 * No remaining freespace, just delete the by-block tree entry.
378 */
379 if ((error = xfs_alloc_delete(bno_cur, &i)))
380 return error;
381 XFS_WANT_CORRUPTED_RETURN(i == 1);
382 } else {
383 /*
384 * Update the by-block entry to start later or be shorter.
385 */
386 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
387 return error;
388 }
389 if (nfbno2 != NULLAGBLOCK) {
390 /*
391 * 2 resulting free entries, need to add one.
392 */
393 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
394 return error;
395 XFS_WANT_CORRUPTED_RETURN(i == 0);
396 if ((error = xfs_alloc_insert(bno_cur, &i)))
397 return error;
398 XFS_WANT_CORRUPTED_RETURN(i == 1);
399 }
400 return 0;
401}
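/*
 * Worked example of the four cases above (values invented for
 * illustration), with the free record fbno = 100, flen = 50:
 *	rbno = 100, rlen = 50: record consumed, no new freespace
 *	rbno = 100, rlen = 20: one record left, [120, 30]
 *	rbno = 130, rlen = 20: one record left, [100, 30]
 *	rbno = 110, rlen = 20: two records left, [100, 10] and [130, 20]
 */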
402
403/*
404 * Read in the allocation group free block array.
405 */
406STATIC int /* error */
407xfs_alloc_read_agfl(
408 xfs_mount_t *mp, /* mount point structure */
409 xfs_trans_t *tp, /* transaction pointer */
410 xfs_agnumber_t agno, /* allocation group number */
411 xfs_buf_t **bpp) /* buffer for the ag free block array */
412{
413 xfs_buf_t *bp; /* return value */
414 int error;
415
416 ASSERT(agno != NULLAGNUMBER);
417 error = xfs_trans_read_buf(
418 mp, tp, mp->m_ddev_targp,
419 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
420 XFS_FSS_TO_BB(mp, 1), 0, &bp);
421 if (error)
422 return error;
423 ASSERT(bp);
424 ASSERT(!XFS_BUF_GETERROR(bp));
425 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
426 *bpp = bp;
427 return 0;
428}
429
430#if defined(XFS_ALLOC_TRACE)
431/*
432 * Add an allocation trace entry for an alloc call.
433 */
434STATIC void
435xfs_alloc_trace_alloc(
436 char *name, /* function tag string */
437 char *str, /* additional string */
438 xfs_alloc_arg_t *args, /* allocation argument structure */
439 int line) /* source line number */
440{
441 ktrace_enter(xfs_alloc_trace_buf,
442 (void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
443 (void *)name,
444 (void *)str,
445 (void *)args->mp,
446 (void *)(__psunsigned_t)args->agno,
447 (void *)(__psunsigned_t)args->agbno,
448 (void *)(__psunsigned_t)args->minlen,
449 (void *)(__psunsigned_t)args->maxlen,
450 (void *)(__psunsigned_t)args->mod,
451 (void *)(__psunsigned_t)args->prod,
452 (void *)(__psunsigned_t)args->minleft,
453 (void *)(__psunsigned_t)args->total,
454 (void *)(__psunsigned_t)args->alignment,
455 (void *)(__psunsigned_t)args->len,
456 (void *)((((__psint_t)args->type) << 16) |
457 (__psint_t)args->otype),
458 (void *)(__psint_t)((args->wasdel << 3) |
459 (args->wasfromfl << 2) |
460 (args->isfl << 1) |
461 (args->userdata << 0)));
462}
463
464/*
465 * Add an allocation trace entry for a free call.
466 */
467STATIC void
468xfs_alloc_trace_free(
469 char *name, /* function tag string */
470 char *str, /* additional string */
471 xfs_mount_t *mp, /* file system mount point */
472 xfs_agnumber_t agno, /* allocation group number */
473 xfs_agblock_t agbno, /* a.g. relative block number */
474 xfs_extlen_t len, /* length of extent */
475 int isfl, /* set if is freelist allocation/free */
476 int line) /* source line number */
477{
478 ktrace_enter(xfs_alloc_trace_buf,
479 (void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
480 (void *)name,
481 (void *)str,
482 (void *)mp,
483 (void *)(__psunsigned_t)agno,
484 (void *)(__psunsigned_t)agbno,
485 (void *)(__psunsigned_t)len,
486 (void *)(__psint_t)isfl,
487 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
488}
489
490/*
491 * Add an allocation trace entry for modifying an agf.
492 */
493STATIC void
494xfs_alloc_trace_modagf(
495 char *name, /* function tag string */
496 char *str, /* additional string */
497 xfs_mount_t *mp, /* file system mount point */
498 xfs_agf_t *agf, /* new agf value */
499 int flags, /* logging flags for agf */
500 int line) /* source line number */
501{
502 ktrace_enter(xfs_alloc_trace_buf,
503 (void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
504 (void *)name,
505 (void *)str,
506 (void *)mp,
507 (void *)(__psint_t)flags,
508 (void *)(__psunsigned_t)INT_GET(agf->agf_seqno, ARCH_CONVERT),
509 (void *)(__psunsigned_t)INT_GET(agf->agf_length, ARCH_CONVERT),
510 (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_BNO],
511 ARCH_CONVERT),
512 (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_CNT],
513 ARCH_CONVERT),
514 (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_BNO],
515 ARCH_CONVERT),
516 (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_CNT],
517 ARCH_CONVERT),
518 (void *)(__psunsigned_t)INT_GET(agf->agf_flfirst, ARCH_CONVERT),
519 (void *)(__psunsigned_t)INT_GET(agf->agf_fllast, ARCH_CONVERT),
520 (void *)(__psunsigned_t)INT_GET(agf->agf_flcount, ARCH_CONVERT),
521 (void *)(__psunsigned_t)INT_GET(agf->agf_freeblks, ARCH_CONVERT),
522 (void *)(__psunsigned_t)INT_GET(agf->agf_longest, ARCH_CONVERT));
523}
524
525STATIC void
526xfs_alloc_trace_busy(
527 char *name, /* function tag string */
528 char *str, /* additional string */
529 xfs_mount_t *mp, /* file system mount point */
530 xfs_agnumber_t agno, /* allocation group number */
531 xfs_agblock_t agbno, /* a.g. relative block number */
532 xfs_extlen_t len, /* length of extent */
533 int slot, /* perag Busy slot */
534 xfs_trans_t *tp,
535 int trtype, /* type: add, delete, search */
536 int line) /* source line number */
537{
538 ktrace_enter(xfs_alloc_trace_buf,
539 (void *)(__psint_t)(trtype | (line << 16)),
540 (void *)name,
541 (void *)str,
542 (void *)mp,
543 (void *)(__psunsigned_t)agno,
544 (void *)(__psunsigned_t)agbno,
545 (void *)(__psunsigned_t)len,
546 (void *)(__psint_t)slot,
547 (void *)tp,
548 NULL, NULL, NULL, NULL, NULL, NULL, NULL);
549}
550#endif /* XFS_ALLOC_TRACE */
551
552/*
553 * Allocation group level functions.
554 */
555
556/*
557 * Allocate a variable extent in the allocation group agno.
558 * Type and bno are used to determine where in the allocation group the
559 * extent will start.
560 * Extent's length (returned in *len) will be between minlen and maxlen,
561 * and of the form k * prod + mod unless there's nothing that large.
562 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
563 */
564STATIC int /* error */
565xfs_alloc_ag_vextent(
566 xfs_alloc_arg_t *args) /* argument structure for allocation */
567{
568 int error=0;
569#ifdef XFS_ALLOC_TRACE
570 static char fname[] = "xfs_alloc_ag_vextent";
571#endif
572
573 ASSERT(args->minlen > 0);
574 ASSERT(args->maxlen > 0);
575 ASSERT(args->minlen <= args->maxlen);
576 ASSERT(args->mod < args->prod);
577 ASSERT(args->alignment > 0);
578 /*
579 * Branch to correct routine based on the type.
580 */
581 args->wasfromfl = 0;
582 switch (args->type) {
583 case XFS_ALLOCTYPE_THIS_AG:
584 error = xfs_alloc_ag_vextent_size(args);
585 break;
586 case XFS_ALLOCTYPE_NEAR_BNO:
587 error = xfs_alloc_ag_vextent_near(args);
588 break;
589 case XFS_ALLOCTYPE_THIS_BNO:
590 error = xfs_alloc_ag_vextent_exact(args);
591 break;
592 default:
593 ASSERT(0);
594 /* NOTREACHED */
595 }
596 if (error)
597 return error;
598 /*
599 * If the allocation worked, need to change the agf structure
600 * (and log it), and the superblock.
601 */
602 if (args->agbno != NULLAGBLOCK) {
603 xfs_agf_t *agf; /* allocation group freelist header */
604#ifdef XFS_ALLOC_TRACE
605 xfs_mount_t *mp = args->mp;
606#endif
607 long slen = (long)args->len;
608
609 ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
610 ASSERT(!(args->wasfromfl) || !args->isfl);
611 ASSERT(args->agbno % args->alignment == 0);
612 if (!(args->wasfromfl)) {
613
614 agf = XFS_BUF_TO_AGF(args->agbp);
615 INT_MOD(agf->agf_freeblks, ARCH_CONVERT, -(args->len));
616 xfs_trans_agblocks_delta(args->tp,
617 -((long)(args->len)));
618 args->pag->pagf_freeblks -= args->len;
619 ASSERT(INT_GET(agf->agf_freeblks, ARCH_CONVERT)
620 <= INT_GET(agf->agf_length, ARCH_CONVERT));
621 TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
622 xfs_alloc_log_agf(args->tp, args->agbp,
623 XFS_AGF_FREEBLKS);
624 /* search the busylist for these blocks */
625 xfs_alloc_search_busy(args->tp, args->agno,
626 args->agbno, args->len);
627 }
628 if (!args->isfl)
629 xfs_trans_mod_sb(args->tp,
630 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
631 XFS_TRANS_SB_FDBLOCKS, -slen);
632 XFS_STATS_INC(xs_allocx);
633 XFS_STATS_ADD(xs_allocb, args->len);
634 }
635 return 0;
636}
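/*
 * Caller sketch (hypothetical values, for illustration only): a
 * "near" allocation of 8-64 blocks close to block 1000 of this AG
 * would fill in the argument structure roughly as
 *	args.type = XFS_ALLOCTYPE_NEAR_BNO;
 *	args.agbno = 1000;
 *	args.minlen = 8;
 *	args.maxlen = 64;
 *	args.mod = 0;  args.prod = 1;  args.alignment = 1;
 * (tp, mp, agbp, agno and pag already set up), then test
 * args.agbno != NULLAGBLOCK after a 0 return.
 */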
637
638/*
639 * Allocate a variable extent at exactly agno/bno.
640 * Extent's length (returned in *len) will be between minlen and maxlen,
641 * and of the form k * prod + mod unless there's nothing that large.
642 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
643 */
644STATIC int /* error */
645xfs_alloc_ag_vextent_exact(
646 xfs_alloc_arg_t *args) /* allocation argument structure */
647{
648 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
649 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
650 xfs_agblock_t end; /* end of allocated extent */
651 int error;
652 xfs_agblock_t fbno; /* start block of found extent */
653 xfs_agblock_t fend; /* end block of found extent */
654 xfs_extlen_t flen; /* length of found extent */
655#ifdef XFS_ALLOC_TRACE
656 static char fname[] = "xfs_alloc_ag_vextent_exact";
657#endif
658 int i; /* success/failure of operation */
659 xfs_agblock_t maxend; /* end of maximal extent */
660 xfs_agblock_t minend; /* end of minimal extent */
661 xfs_extlen_t rlen; /* length of returned extent */
662
663 ASSERT(args->alignment == 1);
664 /*
665 * Allocate/initialize a cursor for the by-number freespace btree.
666 */
667 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
668 args->agno, XFS_BTNUM_BNO, NULL, 0);
669 /*
670 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
671 * Look for the closest free block <= bno, it must contain bno
672 * if any free block does.
673 */
674 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
675 goto error0;
676 if (!i) {
677 /*
678 * Didn't find it, return null.
679 */
680 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
681 args->agbno = NULLAGBLOCK;
682 return 0;
683 }
684 /*
685 * Grab the freespace record.
686 */
687 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
688 goto error0;
689 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
690 ASSERT(fbno <= args->agbno);
691 minend = args->agbno + args->minlen;
692 maxend = args->agbno + args->maxlen;
693 fend = fbno + flen;
694 /*
695 * Give up if the freespace isn't long enough for the minimum request.
696 */
697 if (fend < minend) {
698 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
699 args->agbno = NULLAGBLOCK;
700 return 0;
701 }
702 /*
703 * End of extent will be smaller of the freespace end and the
704 * maximal requested end.
705 */
706 end = XFS_AGBLOCK_MIN(fend, maxend);
707 /*
708 * Fix the length according to mod and prod if given.
709 */
710 args->len = end - args->agbno;
711 xfs_alloc_fix_len(args);
712 if (!xfs_alloc_fix_minleft(args)) {
713 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
714 return 0;
715 }
716 rlen = args->len;
717 ASSERT(args->agbno + rlen <= fend);
718 end = args->agbno + rlen;
719 /*
720 * We are allocating rlen blocks: [agbno .. end)
721 * Allocate/initialize a cursor for the by-size btree.
722 */
723 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
724 args->agno, XFS_BTNUM_CNT, NULL, 0);
725 ASSERT(args->agbno + args->len <=
726 INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
727 ARCH_CONVERT));
728 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
729 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
730 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
731 goto error0;
732 }
733 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
734 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
735 TRACE_ALLOC("normal", args);
736 args->wasfromfl = 0;
737 return 0;
738
739error0:
740 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
741 TRACE_ALLOC("error", args);
742 return error;
743}
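/*
 * Worked example (values invented for illustration): asking for
 * agbno = 500, minlen = 4, maxlen = 16 and finding the free record
 * [490, 40]: fend = 530, minend = 504, maxend = 516.  Since
 * fend >= minend the request fits, and end = min(530, 516) = 516
 * gives len = 16.  Had the record been [490, 10], fend = 500 would
 * fall short of minend and NULLAGBLOCK would be returned.
 */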
744
745/*
746 * Allocate a variable extent near bno in the allocation group agno.
747 * Extent's length (returned in len) will be between minlen and maxlen,
748 * and of the form k * prod + mod unless there's nothing that large.
749 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
750 */
751STATIC int /* error */
752xfs_alloc_ag_vextent_near(
753 xfs_alloc_arg_t *args) /* allocation argument structure */
754{
755 xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
756 xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
757 xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
758#ifdef XFS_ALLOC_TRACE
759 static char fname[] = "xfs_alloc_ag_vextent_near";
760#endif
761 xfs_agblock_t gtbno; /* start bno of right side entry */
762 xfs_agblock_t gtbnoa; /* aligned ... */
763 xfs_extlen_t gtdiff; /* difference to right side entry */
764 xfs_extlen_t gtlen; /* length of right side entry */
765 xfs_extlen_t gtlena; /* aligned ... */
766 xfs_agblock_t gtnew; /* useful start bno of right side */
767 int error; /* error code */
768 int i; /* result code, temporary */
769 int j; /* result code, temporary */
770 xfs_agblock_t ltbno; /* start bno of left side entry */
771 xfs_agblock_t ltbnoa; /* aligned ... */
772 xfs_extlen_t ltdiff; /* difference to left side entry */
773 /*REFERENCED*/
774 xfs_agblock_t ltend; /* end bno of left side entry */
775 xfs_extlen_t ltlen; /* length of left side entry */
776 xfs_extlen_t ltlena; /* aligned ... */
777 xfs_agblock_t ltnew; /* useful start bno of left side */
778 xfs_extlen_t rlen; /* length of returned extent */
779#if defined(DEBUG) && defined(__KERNEL__)
780 /*
781 * Randomly don't execute the first algorithm.
782 */
783 int dofirst; /* set to do first algorithm */
784
785 dofirst = random() & 1;
786#endif
787 /*
788 * Get a cursor for the by-size btree.
789 */
790 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
791 args->agno, XFS_BTNUM_CNT, NULL, 0);
792 ltlen = 0;
793 bno_cur_lt = bno_cur_gt = NULL;
794 /*
795 * See if there are any free extents as big as maxlen.
796 */
797 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
798 goto error0;
799 /*
800 * If none, then pick up the last entry in the tree unless the
801 * tree is empty.
802 */
803 if (!i) {
804 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
805 &ltlen, &i)))
806 goto error0;
807 if (i == 0 || ltlen == 0) {
808 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
809 return 0;
810 }
811 ASSERT(i == 1);
812 }
813 args->wasfromfl = 0;
814 /*
815 * First algorithm.
816 * If the requested extent is large wrt the freespaces available
817 * in this a.g., then the cursor will be pointing to a btree entry
818 * near the right edge of the tree. If it's in the last btree leaf
819 * block, then we just examine all the entries in that block
820 * that are big enough, and pick the best one.
821 * This is written as a while loop so we can break out of it,
822 * but we never loop back to the top.
823 */
824 while (xfs_btree_islastblock(cnt_cur, 0)) {
825 xfs_extlen_t bdiff;
826 int besti=0;
827 xfs_extlen_t blen=0;
828 xfs_agblock_t bnew=0;
829
830#if defined(DEBUG) && defined(__KERNEL__)
831 if (!dofirst)
832 break;
833#endif
834 /*
835 * Start from the entry that lookup found, sequence through
836 * all larger free blocks. If we're actually pointing at a
837 * record smaller than maxlen, go to the start of this block,
838 * and skip all those smaller than minlen.
839 */
840 if (ltlen || args->alignment > 1) {
841 cnt_cur->bc_ptrs[0] = 1;
842 do {
843 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
844 &ltlen, &i)))
845 goto error0;
846 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
847 if (ltlen >= args->minlen)
848 break;
849 if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
850 goto error0;
851 } while (i);
852 ASSERT(ltlen >= args->minlen);
853 if (!i)
854 break;
855 }
856 i = cnt_cur->bc_ptrs[0];
857 for (j = 1, blen = 0, bdiff = 0;
858 !error && j && (blen < args->maxlen || bdiff > 0);
859 error = xfs_alloc_increment(cnt_cur, 0, &j)) {
860 /*
861 * For each entry, decide if it's better than
862 * the previous best entry.
863 */
864 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
865 goto error0;
866 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
867 if (!xfs_alloc_compute_aligned(ltbno, ltlen,
868 args->alignment, args->minlen,
869 &ltbnoa, &ltlena))
870 continue;
871 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
872 xfs_alloc_fix_len(args);
873 ASSERT(args->len >= args->minlen);
874 if (args->len < blen)
875 continue;
876 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
877 args->alignment, ltbno, ltlen, &ltnew);
878 if (ltnew != NULLAGBLOCK &&
879 (args->len > blen || ltdiff < bdiff)) {
880 bdiff = ltdiff;
881 bnew = ltnew;
882 blen = args->len;
883 besti = cnt_cur->bc_ptrs[0];
884 }
885 }
886 /*
887 * It didn't work. We COULD be in a case where
888 * there's a good record somewhere, so try again.
889 */
890 if (blen == 0)
891 break;
892 /*
893 * Point at the best entry, and retrieve it again.
894 */
895 cnt_cur->bc_ptrs[0] = besti;
896 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
897 goto error0;
898 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
899 ltend = ltbno + ltlen;
900 ASSERT(ltend <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
901 ARCH_CONVERT));
902 args->len = blen;
903 if (!xfs_alloc_fix_minleft(args)) {
904 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
905 TRACE_ALLOC("nominleft", args);
906 return 0;
907 }
908 blen = args->len;
909 /*
910 * We are allocating starting at bnew for blen blocks.
911 */
912 args->agbno = bnew;
913 ASSERT(bnew >= ltbno);
914 ASSERT(bnew + blen <= ltend);
915 /*
916 * Set up a cursor for the by-bno tree.
917 */
918 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
919 args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0);
920 /*
921 * Fix up the btree entries.
922 */
923 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
924 ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
925 goto error0;
926 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
927 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
928 TRACE_ALLOC("first", args);
929 return 0;
930 }
931 /*
932 * Second algorithm.
933 * Search in the by-bno tree to the left and to the right
934 * simultaneously, until in each case we find a space big enough,
935 * or run into the edge of the tree. When we run into the edge,
936 * we deallocate that cursor.
937 * If both searches succeed, we compare the two spaces and pick
938 * the better one.
939 * With alignment, it's possible for both to fail; the upper
940 * level algorithm that picks allocation groups for allocations
941 * is not supposed to do this.
942 */
943 /*
944 * Allocate and initialize the cursor for the leftward search.
945 */
946 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
947 args->agno, XFS_BTNUM_BNO, NULL, 0);
948 /*
949 * Lookup <= bno to find the leftward search's starting point.
950 */
951 if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
952 goto error0;
953 if (!i) {
954 /*
955 * Didn't find anything; use this cursor for the rightward
956 * search.
957 */
958 bno_cur_gt = bno_cur_lt;
959 bno_cur_lt = NULL;
960 }
961 /*
962 * Found something. Duplicate the cursor for the rightward search.
963 */
964 else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
965 goto error0;
966 /*
967 * Increment the cursor, so we will point at the entry just right
968 * of the leftward entry if any, or to the leftmost entry.
969 */
970 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
971 goto error0;
972 if (!i) {
973 /*
974 * It failed, there are no rightward entries.
975 */
976 xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
977 bno_cur_gt = NULL;
978 }
979 /*
980 * Loop going left with the leftward cursor, right with the
981 * rightward cursor, until either both directions give up or
982 * we find an entry at least as big as minlen.
983 */
984 do {
985 if (bno_cur_lt) {
986 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
987 goto error0;
988 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
989 if (xfs_alloc_compute_aligned(ltbno, ltlen,
990 args->alignment, args->minlen,
991 &ltbnoa, &ltlena))
992 break;
993 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
994 goto error0;
995 if (!i) {
996 xfs_btree_del_cursor(bno_cur_lt,
997 XFS_BTREE_NOERROR);
998 bno_cur_lt = NULL;
999 }
1000 }
1001 if (bno_cur_gt) {
1002 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1003 goto error0;
1004 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1005 if (xfs_alloc_compute_aligned(gtbno, gtlen,
1006 args->alignment, args->minlen,
1007 &gtbnoa, &gtlena))
1008 break;
1009 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(bno_cur_gt,
1013 XFS_BTREE_NOERROR);
1014 bno_cur_gt = NULL;
1015 }
1016 }
1017 } while (bno_cur_lt || bno_cur_gt);
1018 /*
1019 * Got both cursors still active, need to find better entry.
1020 */
1021 if (bno_cur_lt && bno_cur_gt) {
1022 /*
1023 * Left side is long enough, look for a right side entry.
1024 */
1025 if (ltlena >= args->minlen) {
1026 /*
1027 * Fix up the length.
1028 */
1029 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1030 xfs_alloc_fix_len(args);
1031 rlen = args->len;
1032 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1033 args->alignment, ltbno, ltlen, &ltnew);
1034 /*
1035 * Not perfect.
1036 */
1037 if (ltdiff) {
1038 /*
1039 * Look until we find a better one, run out of
1040 * space, or run off the end.
1041 */
1042 while (bno_cur_lt && bno_cur_gt) {
1043 if ((error = xfs_alloc_get_rec(
1044 bno_cur_gt, &gtbno,
1045 &gtlen, &i)))
1046 goto error0;
1047 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1048 xfs_alloc_compute_aligned(gtbno, gtlen,
1049 args->alignment, args->minlen,
1050 &gtbnoa, &gtlena);
1051 /*
1052 * The left one is clearly better.
1053 */
1054 if (gtbnoa >= args->agbno + ltdiff) {
1055 xfs_btree_del_cursor(
1056 bno_cur_gt,
1057 XFS_BTREE_NOERROR);
1058 bno_cur_gt = NULL;
1059 break;
1060 }
1061 /*
1062 * If we reach a big enough entry,
1063 * compare the two and pick the best.
1064 */
1065 if (gtlena >= args->minlen) {
1066 args->len =
1067 XFS_EXTLEN_MIN(gtlena,
1068 args->maxlen);
1069 xfs_alloc_fix_len(args);
1070 rlen = args->len;
1071 gtdiff = xfs_alloc_compute_diff(
1072 args->agbno, rlen,
1073 args->alignment,
1074 gtbno, gtlen, &gtnew);
1075 /*
1076 * Right side is better.
1077 */
1078 if (gtdiff < ltdiff) {
1079 xfs_btree_del_cursor(
1080 bno_cur_lt,
1081 XFS_BTREE_NOERROR);
1082 bno_cur_lt = NULL;
1083 }
1084 /*
1085 * Left side is better.
1086 */
1087 else {
1088 xfs_btree_del_cursor(
1089 bno_cur_gt,
1090 XFS_BTREE_NOERROR);
1091 bno_cur_gt = NULL;
1092 }
1093 break;
1094 }
1095 /*
1096 * Fell off the right end.
1097 */
1098 if ((error = xfs_alloc_increment(
1099 bno_cur_gt, 0, &i)))
1100 goto error0;
1101 if (!i) {
1102 xfs_btree_del_cursor(
1103 bno_cur_gt,
1104 XFS_BTREE_NOERROR);
1105 bno_cur_gt = NULL;
1106 break;
1107 }
1108 }
1109 }
1110 /*
1111 * The left side is perfect, trash the right side.
1112 */
1113 else {
1114 xfs_btree_del_cursor(bno_cur_gt,
1115 XFS_BTREE_NOERROR);
1116 bno_cur_gt = NULL;
1117 }
1118 }
1119 /*
1120 * It's the right side that was found first, look left.
1121 */
1122 else {
1123 /*
1124 * Fix up the length.
1125 */
1126 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1127 xfs_alloc_fix_len(args);
1128 rlen = args->len;
1129 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1130 args->alignment, gtbno, gtlen, &gtnew);
1131 /*
1132 * Right side entry isn't perfect.
1133 */
1134 if (gtdiff) {
1135 /*
1136 * Look until we find a better one, run out of
1137 * space, or run off the end.
1138 */
1139 while (bno_cur_lt && bno_cur_gt) {
1140 if ((error = xfs_alloc_get_rec(
1141 bno_cur_lt, &ltbno,
1142 &ltlen, &i)))
1143 goto error0;
1144 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1145 xfs_alloc_compute_aligned(ltbno, ltlen,
1146 args->alignment, args->minlen,
1147 &ltbnoa, &ltlena);
1148 /*
1149 * The right one is clearly better.
1150 */
1151 if (ltbnoa <= args->agbno - gtdiff) {
1152 xfs_btree_del_cursor(
1153 bno_cur_lt,
1154 XFS_BTREE_NOERROR);
1155 bno_cur_lt = NULL;
1156 break;
1157 }
1158 /*
1159 * If we reach a big enough entry,
1160 * compare the two and pick the best.
1161 */
1162 if (ltlena >= args->minlen) {
1163 args->len = XFS_EXTLEN_MIN(
1164 ltlena, args->maxlen);
1165 xfs_alloc_fix_len(args);
1166 rlen = args->len;
1167 ltdiff = xfs_alloc_compute_diff(
1168 args->agbno, rlen,
1169 args->alignment,
1170 ltbno, ltlen, &ltnew);
1171 /*
1172 * Left side is better.
1173 */
1174 if (ltdiff < gtdiff) {
1175 xfs_btree_del_cursor(
1176 bno_cur_gt,
1177 XFS_BTREE_NOERROR);
1178 bno_cur_gt = NULL;
1179 }
1180 /*
1181 * Right side is better.
1182 */
1183 else {
1184 xfs_btree_del_cursor(
1185 bno_cur_lt,
1186 XFS_BTREE_NOERROR);
1187 bno_cur_lt = NULL;
1188 }
1189 break;
1190 }
1191 /*
1192 * Fell off the left end.
1193 */
1194 if ((error = xfs_alloc_decrement(
1195 bno_cur_lt, 0, &i)))
1196 goto error0;
1197 if (!i) {
1198 xfs_btree_del_cursor(bno_cur_lt,
1199 XFS_BTREE_NOERROR);
1200 bno_cur_lt = NULL;
1201 break;
1202 }
1203 }
1204 }
1205 /*
1206 * The right side is perfect, trash the left side.
1207 */
1208 else {
1209 xfs_btree_del_cursor(bno_cur_lt,
1210 XFS_BTREE_NOERROR);
1211 bno_cur_lt = NULL;
1212 }
1213 }
1214 }
1215 /*
1216 * If we couldn't get anything, give up.
1217 */
1218 if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
1219 TRACE_ALLOC("neither", args);
1220 args->agbno = NULLAGBLOCK;
1221 return 0;
1222 }
1223 /*
1224 * At this point we have selected a freespace entry, either to the
1225 * left or to the right. If it's on the right, copy all the
1226 * useful variables to the "left" set so we only have one
1227 * copy of this code.
1228 */
1229 if (bno_cur_gt) {
1230 bno_cur_lt = bno_cur_gt;
1231 bno_cur_gt = NULL;
1232 ltbno = gtbno;
1233 ltbnoa = gtbnoa;
1234 ltlen = gtlen;
1235 ltlena = gtlena;
1236 j = 1;
1237 } else
1238 j = 0;
1239 /*
1240 * Fix up the length and compute the useful address.
1241 */
1242 ltend = ltbno + ltlen;
1243 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1244 xfs_alloc_fix_len(args);
1245 if (!xfs_alloc_fix_minleft(args)) {
1246 TRACE_ALLOC("nominleft", args);
1247 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
1248 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1249 return 0;
1250 }
1251 rlen = args->len;
1252 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
1253 ltlen, &ltnew);
1254 ASSERT(ltnew >= ltbno);
1255 ASSERT(ltnew + rlen <= ltend);
1256 ASSERT(ltnew + rlen <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
1257 ARCH_CONVERT));
1258 args->agbno = ltnew;
1259 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
1260 ltnew, rlen, XFSA_FIXUP_BNO_OK)))
1261 goto error0;
1262 TRACE_ALLOC(j ? "gt" : "lt", args);
1263 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1264 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
1265 return 0;
1266
1267 error0:
1268 TRACE_ALLOC("error", args);
1269 if (cnt_cur != NULL)
1270 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1271 if (bno_cur_lt != NULL)
1272 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
1273 if (bno_cur_gt != NULL)
1274 xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
1275 return error;
1276}
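/*
 * Illustration of the second algorithm's tie-breaking (values
 * invented): asking near agbno = 1000, suppose the leftward search
 * stops at an entry usable from ltnew = 960 (diff 40) and the
 * rightward search at one usable from gtnew = 1010 (diff 10).
 * Since gtdiff < ltdiff the right-side entry wins, the left cursor
 * is deleted, and the right-side data is copied into the "lt"
 * variables for the common fixup code above.
 */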
1277
1278/*
1279 * Allocate a variable extent anywhere in the allocation group agno.
1280 * Extent's length (returned in len) will be between minlen and maxlen,
1281 * and of the form k * prod + mod unless there's nothing that large.
1282 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1283 */
1284STATIC int /* error */
1285xfs_alloc_ag_vextent_size(
1286 xfs_alloc_arg_t *args) /* allocation argument structure */
1287{
1288 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
1289 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
1290 int error; /* error result */
1291 xfs_agblock_t fbno; /* start of found freespace */
1292 xfs_extlen_t flen; /* length of found freespace */
1293#ifdef XFS_ALLOC_TRACE
1294 static char fname[] = "xfs_alloc_ag_vextent_size";
1295#endif
1296 int i; /* temp status variable */
1297 xfs_agblock_t rbno; /* returned block number */
1298 xfs_extlen_t rlen; /* length of returned extent */
1299
1300 /*
1301 * Allocate and initialize a cursor for the by-size btree.
1302 */
1303 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
1304 args->agno, XFS_BTNUM_CNT, NULL, 0);
1305 bno_cur = NULL;
1306 /*
1307 * Look for an entry >= maxlen+alignment-1 blocks.
1308 */
1309 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1310 args->maxlen + args->alignment - 1, &i)))
1311 goto error0;
1312 /*
1313 * If none, then pick up the last entry in the tree unless the
1314 * tree is empty.
1315 */
1316 if (!i) {
1317 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
1318 &flen, &i)))
1319 goto error0;
1320 if (i == 0 || flen == 0) {
1321 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1322 TRACE_ALLOC("noentry", args);
1323 return 0;
1324 }
1325 ASSERT(i == 1);
1326 }
1327 /*
1328 * There's a freespace as big as maxlen+alignment-1, get it.
1329 */
1330 else {
1331 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
1332 goto error0;
1333 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1334 }
1335 /*
1336 * In the first case above, we got the last entry in the
1337 * by-size btree. Now we check to see if the space hits maxlen
1338 * once aligned; if not, we search left for something better.
1339 * This can't happen in the second case above.
1340 */
1341 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
1342 &rbno, &rlen);
1343 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1344 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1345 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
1346 if (rlen < args->maxlen) {
1347 xfs_agblock_t bestfbno;
1348 xfs_extlen_t bestflen;
1349 xfs_agblock_t bestrbno;
1350 xfs_extlen_t bestrlen;
1351
1352 bestrlen = rlen;
1353 bestrbno = rbno;
1354 bestflen = flen;
1355 bestfbno = fbno;
1356 for (;;) {
1357 if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
1358 goto error0;
1359 if (i == 0)
1360 break;
1361 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1362 &i)))
1363 goto error0;
1364 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1365 if (flen < bestrlen)
1366 break;
1367 xfs_alloc_compute_aligned(fbno, flen, args->alignment,
1368 args->minlen, &rbno, &rlen);
1369 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1370 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1371 (rlen <= flen && rbno + rlen <= fbno + flen),
1372 error0);
1373 if (rlen > bestrlen) {
1374 bestrlen = rlen;
1375 bestrbno = rbno;
1376 bestflen = flen;
1377 bestfbno = fbno;
1378 if (rlen == args->maxlen)
1379 break;
1380 }
1381 }
1382 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1383 &i)))
1384 goto error0;
1385 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1386 rlen = bestrlen;
1387 rbno = bestrbno;
1388 flen = bestflen;
1389 fbno = bestfbno;
1390 }
1391 args->wasfromfl = 0;
1392 /*
1393 * Fix up the length.
1394 */
1395 args->len = rlen;
1396 xfs_alloc_fix_len(args);
1397 if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
1398 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1399 TRACE_ALLOC("nominleft", args);
1400 args->agbno = NULLAGBLOCK;
1401 return 0;
1402 }
1403 rlen = args->len;
1404 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
1405 /*
1406 * Allocate and initialize a cursor for the by-block tree.
1407 */
1408 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
1409 args->agno, XFS_BTNUM_BNO, NULL, 0);
1410 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1411 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1412 goto error0;
1413 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1414 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1415 cnt_cur = bno_cur = NULL;
1416 args->len = rlen;
1417 args->agbno = rbno;
1418 XFS_WANT_CORRUPTED_GOTO(
1419 args->agbno + args->len <=
1420 INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
1421 ARCH_CONVERT),
1422 error0);
1423 TRACE_ALLOC("normal", args);
1424 return 0;
1425
1426error0:
1427 TRACE_ALLOC("error", args);
1428 if (cnt_cur)
1429 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1430 if (bno_cur)
1431 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1432 return error;
1433}
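/*
 * Worked example (values invented for illustration): maxlen = 32,
 * alignment = 8, so the lookup asks for >= 39 blocks.  If only the
 * last record [201, 33] qualifies, alignment pushes rbno to 208 and
 * leaves rlen = 26 < maxlen, so the loop walks left; a neighboring
 * record [160, 32] is already aligned and yields rlen = 32, which
 * hits maxlen and ends the search with the better extent.
 */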
1434
1435/*
1436 * Deal with the case where only small freespaces remain.
1437 * Either return the contents of the last freespace record,
1438 * or allocate space from the freelist if there is nothing in the tree.
1439 */
1440STATIC int /* error */
1441xfs_alloc_ag_vextent_small(
1442 xfs_alloc_arg_t *args, /* allocation argument structure */
1443 xfs_btree_cur_t *ccur, /* by-size cursor */
1444 xfs_agblock_t *fbnop, /* result block number */
1445 xfs_extlen_t *flenp, /* result length */
1446 int *stat) /* status: 0-freelist, 1-normal/none */
1447{
1448 int error;
1449 xfs_agblock_t fbno;
1450 xfs_extlen_t flen;
1451#ifdef XFS_ALLOC_TRACE
1452 static char fname[] = "xfs_alloc_ag_vextent_small";
1453#endif
1454 int i;
1455
1456 if ((error = xfs_alloc_decrement(ccur, 0, &i)))
1457 goto error0;
1458 if (i) {
1459 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
1460 goto error0;
1461 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1462 }
1463 /*
1464 * Nothing in the btree, try the freelist. Make sure
1465 * to respect minleft even when pulling from the
1466 * freelist.
1467 */
1468 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
1469 (INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_flcount,
1470 ARCH_CONVERT) > args->minleft)) {
1471 if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno)))
1472 goto error0;
1473 if (fbno != NULLAGBLOCK) {
1474 if (args->userdata) {
1475 xfs_buf_t *bp;
1476
1477 bp = xfs_btree_get_bufs(args->mp, args->tp,
1478 args->agno, fbno, 0);
1479 xfs_trans_binval(args->tp, bp);
1480 }
1481 args->len = 1;
1482 args->agbno = fbno;
1483 XFS_WANT_CORRUPTED_GOTO(
1484 args->agbno + args->len <=
1485 INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
1486 ARCH_CONVERT),
1487 error0);
1488 args->wasfromfl = 1;
1489 TRACE_ALLOC("freelist", args);
1490 *stat = 0;
1491 return 0;
1492 }
1493 /*
1494 * Nothing in the freelist.
1495 */
1496 else
1497 flen = 0;
1498 }
1499 /*
1500 * Can't allocate from the freelist for some reason.
1501 */
1502 else
1503 flen = 0;
1504 /*
1505 * Can't do the allocation, give up.
1506 */
1507 if (flen < args->minlen) {
1508 args->agbno = NULLAGBLOCK;
1509 TRACE_ALLOC("notenough", args);
1510 flen = 0;
1511 }
1512 *fbnop = fbno;
1513 *flenp = flen;
1514 *stat = 1;
1515 TRACE_ALLOC("normal", args);
1516 return 0;
1517
1518error0:
1519 TRACE_ALLOC("error", args);
1520 return error;
1521}
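/*
 * Illustration (hypothetical case): a minlen = 1, alignment = 1,
 * non-freelist allocation that finds an empty by-size tree while
 * agf_flcount > minleft takes one block straight off the freelist,
 * returns it in args->agbno with args->len = 1, and reports
 * *stat = 0 so the caller skips the btree fixup path.
 */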
1522
1523/*
1524 * Free the extent starting at agno/bno for length.
1525 */
1526STATIC int /* error */
1527xfs_free_ag_extent(
1528 xfs_trans_t *tp, /* transaction pointer */
1529 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
1530 xfs_agnumber_t agno, /* allocation group number */
1531 xfs_agblock_t bno, /* starting block number */
1532 xfs_extlen_t len, /* length of extent */
1533 int isfl) /* set if is freelist blocks - no sb acctg */
1534{
1535 xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
1536 xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
1537 int error; /* error return value */
1538#ifdef XFS_ALLOC_TRACE
1539 static char fname[] = "xfs_free_ag_extent";
1540#endif
1541 xfs_agblock_t gtbno; /* start of right neighbor block */
1542 xfs_extlen_t gtlen; /* length of right neighbor block */
1543 int haveleft; /* have a left neighbor block */
1544 int haveright; /* have a right neighbor block */
1545 int i; /* temp, result code */
1546 xfs_agblock_t ltbno; /* start of left neighbor block */
1547 xfs_extlen_t ltlen; /* length of left neighbor block */
1548 xfs_mount_t *mp; /* mount point struct for filesystem */
1549 xfs_agblock_t nbno; /* new starting block of freespace */
1550 xfs_extlen_t nlen; /* new length of freespace */
1551
1552 mp = tp->t_mountp;
1553 /*
1554 * Allocate and initialize a cursor for the by-block btree.
1555 */
1556 bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL,
1557 0);
1558 cnt_cur = NULL;
1559 /*
1560 * Look for a neighboring block on the left (lower block numbers)
1561 * that is contiguous with this space.
1562 */
1563 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1564 goto error0;
1565 if (haveleft) {
1566 /*
1567 * There is a block to our left.
1568 */
1569 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1570 goto error0;
1571 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1572 /*
1573 * It's not contiguous, though.
1574 */
1575 if (ltbno + ltlen < bno)
1576 haveleft = 0;
1577 else {
1578 /*
1579 * If this failure happens the request to free this
1580 * space was invalid, it's (partly) already free.
1581 * Very bad.
1582 */
1583 XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
1584 }
1585 }
1586 /*
1587 * Look for a neighboring block on the right (higher block numbers)
1588 * that is contiguous with this space.
1589 */
1590 if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
1591 goto error0;
1592 if (haveright) {
1593 /*
1594 * There is a block to our right.
1595 */
1596 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1597 goto error0;
1598 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1599 /*
1600 * It's not contiguous, though.
1601 */
1602 if (bno + len < gtbno)
1603 haveright = 0;
1604 else {
1605 /*
1606 * If this failure happens the request to free this
1607 * space was invalid, it's (partly) already free.
1608 * Very bad.
1609 */
1610 XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
1611 }
1612 }
1613 /*
1614 * Now allocate and initialize a cursor for the by-size tree.
1615 */
1616 cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL,
1617 0);
1618 /*
1619 * Have both left and right contiguous neighbors.
1620 * Merge all three into a single free block.
1621 */
1622 if (haveleft && haveright) {
1623 /*
1624 * Delete the old by-size entry on the left.
1625 */
1626 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1627 goto error0;
1628 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1629 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1630 goto error0;
1631 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1632 /*
1633 * Delete the old by-size entry on the right.
1634 */
1635 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1636 goto error0;
1637 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1638 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1639 goto error0;
1640 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1641 /*
1642 * Delete the old by-block entry for the right block.
1643 */
1644 if ((error = xfs_alloc_delete(bno_cur, &i)))
1645 goto error0;
1646 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1647 /*
1648 * Move the by-block cursor back to the left neighbor.
1649 */
1650 if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
1651 goto error0;
1652 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1653#ifdef DEBUG
1654 /*
1655 * Check that this is the right record: delete didn't
1656 * mangle the cursor.
1657 */
1658 {
1659 xfs_agblock_t xxbno;
1660 xfs_extlen_t xxlen;
1661
1662 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
1663 &i)))
1664 goto error0;
1665 XFS_WANT_CORRUPTED_GOTO(
1666 i == 1 && xxbno == ltbno && xxlen == ltlen,
1667 error0);
1668 }
1669#endif
1670 /*
1671 * Update remaining by-block entry to the new, joined block.
1672 */
1673 nbno = ltbno;
1674 nlen = len + ltlen + gtlen;
1675 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1676 goto error0;
1677 }
1678 /*
1679 * Have only a left contiguous neighbor.
1680 * Merge it together with the new freespace.
1681 */
1682 else if (haveleft) {
1683 /*
1684 * Delete the old by-size entry on the left.
1685 */
1686 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1687 goto error0;
1688 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1689 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1690 goto error0;
1691 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1692 /*
1693 * Back up the by-block cursor to the left neighbor, and
1694 * update its length.
1695 */
1696 if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
1697 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1699 nbno = ltbno;
1700 nlen = len + ltlen;
1701 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1702 goto error0;
1703 }
1704 /*
1705 * Have only a right contiguous neighbor.
1706 * Merge it together with the new freespace.
1707 */
1708 else if (haveright) {
1709 /*
1710 * Delete the old by-size entry on the right.
1711 */
1712 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1713 goto error0;
1714 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1715 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1716 goto error0;
1717 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1718 /*
1719 * Update the starting block and length of the right
1720 * neighbor in the by-block tree.
1721 */
1722 nbno = bno;
1723 nlen = len + gtlen;
1724 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1725 goto error0;
1726 }
1727 /*
1728 * No contiguous neighbors.
1729 * Insert the new freespace into the by-block tree.
1730 */
1731 else {
1732 nbno = bno;
1733 nlen = len;
1734 if ((error = xfs_alloc_insert(bno_cur, &i)))
1735 goto error0;
1736 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1737 }
1738 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1739 bno_cur = NULL;
1740 /*
1741 * In all cases we need to insert the new freespace in the by-size tree.
1742 */
1743 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1744 goto error0;
1745 XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
1746 if ((error = xfs_alloc_insert(cnt_cur, &i)))
1747 goto error0;
1748 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1749 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1750 cnt_cur = NULL;
1751 /*
1752 * Update the freespace totals in the ag and superblock.
1753 */
1754 {
1755 xfs_agf_t *agf;
1756 xfs_perag_t *pag; /* per allocation group data */
1757
1758 agf = XFS_BUF_TO_AGF(agbp);
1759 pag = &mp->m_perag[agno];
1760 INT_MOD(agf->agf_freeblks, ARCH_CONVERT, len);
1761 xfs_trans_agblocks_delta(tp, len);
1762 pag->pagf_freeblks += len;
1763 XFS_WANT_CORRUPTED_GOTO(
1764 INT_GET(agf->agf_freeblks, ARCH_CONVERT)
1765 <= INT_GET(agf->agf_length, ARCH_CONVERT),
1766 error0);
1767 TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
1768 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1769 if (!isfl)
1770 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1771 XFS_STATS_INC(xs_freex);
1772 XFS_STATS_ADD(xs_freeb, len);
1773 }
1774 TRACE_FREE(haveleft ?
1775 (haveright ? "both" : "left") :
1776 (haveright ? "right" : "none"),
1777 agno, bno, len, isfl);
1778
1779 /*
1780 * Since blocks move to the free list without the coordination
1781 * used in xfs_bmap_finish, we can't allow a block to be available
1782 * for reallocation and non-transaction writing (user data)
1783 * until we know that the transaction that moved it to the free
1784 * list is permanently on disk. We track the blocks by declaring
1785 * these blocks as "busy"; the busy list is maintained on a per-ag
1786 * basis and each transaction records which entries should be removed
1787 * when the iclog commits to disk. If a busy block is allocated,
1788 * the iclog is pushed up to the LSN that freed the block.
1789 */
1790 xfs_alloc_mark_busy(tp, agno, bno, len);
1791 return 0;
1792
1793 error0:
1794 TRACE_FREE("error", agno, bno, len, isfl);
1795 if (bno_cur)
1796 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1797 if (cnt_cur)
1798 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1799 return error;
1800}
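/*
 * Worked coalescing example (values invented for illustration):
 * freeing [100, 20] with an adjacent left record [80, 20] and an
 * adjacent right record [120, 30] takes the haveleft && haveright
 * path: both old by-size entries and the right by-block entry are
 * deleted, the left by-block entry becomes [80, 70], and a single
 * new by-size entry for 70 blocks is inserted.
 */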
1801
1802/*
1803 * Visible (exported) allocation/free functions.
1804 * Some of these are used just by xfs_alloc_btree.c and this file.
1805 */
1806
1807/*
1808 * Compute and fill in value of m_ag_maxlevels.
1809 */
1810void
1811xfs_alloc_compute_maxlevels(
1812 xfs_mount_t *mp) /* file system mount structure */
1813{
1814 int level;
1815 uint maxblocks;
1816 uint maxleafents;
1817 int minleafrecs;
1818 int minnoderecs;
1819
1820 maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
1821 minleafrecs = mp->m_alloc_mnr[0];
1822 minnoderecs = mp->m_alloc_mnr[1];
1823 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1824 for (level = 1; maxblocks > 1; level++)
1825 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1826 mp->m_ag_maxlevels = level;
1827}
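/*
 * Worked example (block counts invented for illustration): with
 * sb_agblocks = 1048576, maxleafents = 524288; assuming minimum
 * records per block of 125 for both leaves and nodes,
 *	maxblocks = ceil(524288 / 125) = 4195
 *	-> ceil(4195 / 125) = 34 -> ceil(34 / 125) = 1
 * leaving m_ag_maxlevels = 3 after the loop.
 */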
1828
1829/*
1830 * Decide whether to use this allocation group for this allocation.
1831 * If so, fix up the btree freelist's size.
1832 */
1833STATIC int /* error */
1834xfs_alloc_fix_freelist(
1835 xfs_alloc_arg_t *args, /* allocation argument structure */
1836 int flags) /* XFS_ALLOC_FLAG_... */
1837{
1838 xfs_buf_t *agbp; /* agf buffer pointer */
1839 xfs_agf_t *agf; /* a.g. freespace structure pointer */
1840 xfs_buf_t *agflbp;/* agfl buffer pointer */
1841 xfs_agblock_t bno; /* freelist block */
1842 xfs_extlen_t delta; /* new blocks needed in freelist */
1843 int error; /* error result code */
1844 xfs_extlen_t longest;/* longest extent in allocation group */
1845 xfs_mount_t *mp; /* file system mount point structure */
1846 xfs_extlen_t need; /* total blocks needed in freelist */
1847 xfs_perag_t *pag; /* per-ag information structure */
1848 xfs_alloc_arg_t targs; /* local allocation arguments */
1849 xfs_trans_t *tp; /* transaction pointer */
1850
1851 mp = args->mp;
1852
1853 pag = args->pag;
1854 tp = args->tp;
1855 if (!pag->pagf_init) {
1856 if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
1857 &agbp)))
1858 return error;
1859 if (!pag->pagf_init) {
1860 args->agbp = NULL;
1861 return 0;
1862 }
1863 } else
1864 agbp = NULL;
1865
1866 /* If this is a metadata-preferred pag and we are allocating
1867 * user data, then try somewhere else unless we are being asked
1868 * to try harder at this point.
1869 */
1870 if (pag->pagf_metadata && args->userdata && flags) {
1871 args->agbp = NULL;
1872 return 0;
1873 }
1874
1875 need = XFS_MIN_FREELIST_PAG(pag, mp);
1876 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
1877 /*
1878 * If it looks like there isn't a long enough extent, or enough
1879 * total blocks, reject it.
1880 */
1881 longest = (pag->pagf_longest > delta) ?
1882 (pag->pagf_longest - delta) :
1883 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
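	/*
	 * (Descriptive note, not original source text: if the freelist
	 * top-up would consume the longest extent, the ?: falls back to
	 * a boolean - 1 if any free space remains at all, else 0 -
	 * used as a length.)
	 */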
1884 if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
1885 (args->minleft &&
1886 (int)(pag->pagf_freeblks + pag->pagf_flcount -
1887 need - args->total) <
1888 (int)args->minleft)) {
1889 if (agbp)
1890 xfs_trans_brelse(tp, agbp);
1891 args->agbp = NULL;
1892 return 0;
1893 }
1894 /*
1895 * Get the a.g. freespace buffer.
1896	 * Can fail if we're not blocking on locks and the buffer is already held.
1897 */
1898 if (agbp == NULL) {
1899 if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
1900 &agbp)))
1901 return error;
1902 if (agbp == NULL) {
1903 args->agbp = NULL;
1904 return 0;
1905 }
1906 }
1907 /*
1908 * Figure out how many blocks we should have in the freelist.
1909 */
1910 agf = XFS_BUF_TO_AGF(agbp);
1911 need = XFS_MIN_FREELIST(agf, mp);
1912 delta = need > INT_GET(agf->agf_flcount, ARCH_CONVERT) ?
1913 (need - INT_GET(agf->agf_flcount, ARCH_CONVERT)) : 0;
1914 /*
1915	 * If there still isn't a long enough extent or enough total blocks, reject it.
1916 */
1917 longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
1918 longest = (longest > delta) ? (longest - delta) :
1919 (INT_GET(agf->agf_flcount, ARCH_CONVERT) > 0 || longest > 0);
1920 if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
1921 (args->minleft &&
1922 (int)(INT_GET(agf->agf_freeblks, ARCH_CONVERT) +
1923 INT_GET(agf->agf_flcount, ARCH_CONVERT) - need - args->total) <
1924 (int)args->minleft)) {
1925 xfs_trans_brelse(tp, agbp);
1926 args->agbp = NULL;
1927 return 0;
1928 }
1929 /*
1930 * Make the freelist shorter if it's too long.
1931 */
1932 while (INT_GET(agf->agf_flcount, ARCH_CONVERT) > need) {
1933 xfs_buf_t *bp;
1934
1935 if ((error = xfs_alloc_get_freelist(tp, agbp, &bno)))
1936 return error;
1937 if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
1938 return error;
1939 bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
1940 xfs_trans_binval(tp, bp);
1941 }
1942 /*
1943 * Initialize the args structure.
1944 */
1945 targs.tp = tp;
1946 targs.mp = mp;
1947 targs.agbp = agbp;
1948 targs.agno = args->agno;
1949 targs.mod = targs.minleft = targs.wasdel = targs.userdata =
1950 targs.minalignslop = 0;
1951 targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
1952 targs.type = XFS_ALLOCTYPE_THIS_AG;
1953 targs.pag = pag;
1954 if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
1955 return error;
1956 /*
1957 * Make the freelist longer if it's too short.
1958 */
1959 while (INT_GET(agf->agf_flcount, ARCH_CONVERT) < need) {
1960 targs.agbno = 0;
1961 targs.maxlen = need - INT_GET(agf->agf_flcount, ARCH_CONVERT);
1962 /*
1963 * Allocate as many blocks as possible at once.
1964 */
1965 if ((error = xfs_alloc_ag_vextent(&targs)))
1966 return error;
1967 /*
1968 * Stop if we run out. Won't happen if callers are obeying
1969 * the restrictions correctly. Can happen for free calls
1970 * on a completely full ag.
1971 */
1972 if (targs.agbno == NULLAGBLOCK)
1973 break;
1974 /*
1975 * Put each allocated block on the list.
1976 */
1977 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
1978 if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp,
1979 bno)))
1980 return error;
1981 }
1982 }
1983 args->agbp = agbp;
1984 return 0;
1985}
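/*
 * Summary of the flow above, paraphrasing (not original source text):
 * the AG is first vetted against minlen/minleft using the cached pagf
 * counters, re-checked against the real AGF once the buffer is held,
 * and the freelist is then resized toward XFS_MIN_FREELIST(): blocks
 * are freed back to the btrees one at a time when it is too long, and
 * allocated with XFS_ALLOCTYPE_THIS_AG and enqueued one block at a
 * time when it is too short.
 */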
1986
1987/*
1988 * Get a block from the freelist.
1989 * Returns with the buffer for the block gotten.
1990 */
1991int /* error */
1992xfs_alloc_get_freelist(
1993 xfs_trans_t *tp, /* transaction pointer */
1994 xfs_buf_t *agbp, /* buffer containing the agf structure */
1995 xfs_agblock_t *bnop) /* block address retrieved from freelist */
1996{
1997 xfs_agf_t *agf; /* a.g. freespace structure */
1998 xfs_agfl_t *agfl; /* a.g. freelist structure */
1999 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
2000 xfs_agblock_t bno; /* block number returned */
2001 int error;
2002#ifdef XFS_ALLOC_TRACE
2003 static char fname[] = "xfs_alloc_get_freelist";
2004#endif
2005 xfs_mount_t *mp; /* mount structure */
2006 xfs_perag_t *pag; /* per allocation group data */
2007
2008 agf = XFS_BUF_TO_AGF(agbp);
2009 /*
2010 * Freelist is empty, give up.
2011 */
2012 if (!agf->agf_flcount) {
2013 *bnop = NULLAGBLOCK;
2014 return 0;
2015 }
2016 /*
2017 * Read the array of free blocks.
2018 */
2019 mp = tp->t_mountp;
2020 if ((error = xfs_alloc_read_agfl(mp, tp,
2021 INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
2022 return error;
2023 agfl = XFS_BUF_TO_AGFL(agflbp);
2024 /*
2025 * Get the block number and update the data structures.
2026 */
2027 bno = INT_GET(agfl->agfl_bno[INT_GET(agf->agf_flfirst, ARCH_CONVERT)], ARCH_CONVERT);
2028 INT_MOD(agf->agf_flfirst, ARCH_CONVERT, 1);
2029 xfs_trans_brelse(tp, agflbp);
2030 if (INT_GET(agf->agf_flfirst, ARCH_CONVERT) == XFS_AGFL_SIZE(mp))
2031 agf->agf_flfirst = 0;
2032 pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
2033 INT_MOD(agf->agf_flcount, ARCH_CONVERT, -1);
2034 xfs_trans_agflist_delta(tp, -1);
2035 pag->pagf_flcount--;
2036 TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
2037 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
2038 *bnop = bno;
2039
2040 /*
2041 * As blocks are freed, they are added to the per-ag busy list
2042 * and remain there until the freeing transaction is committed to
2043 * disk. Now that we have allocated blocks, this list must be
2044 * searched to see if a block is being reused. If one is, then
2045 * the freeing transaction must be pushed to disk NOW by forcing
2046	 * to disk all iclogs up to that transaction's LSN.
2047 */
2048 xfs_alloc_search_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
2049 return 0;
2050}
2051
2052/*
2053 * Log the given fields from the agf structure.
2054 */
2055void
2056xfs_alloc_log_agf(
2057 xfs_trans_t *tp, /* transaction pointer */
2058 xfs_buf_t *bp, /* buffer for a.g. freelist header */
2059 int fields) /* mask of fields to be logged (XFS_AGF_...) */
2060{
2061 int first; /* first byte offset */
2062 int last; /* last byte offset */
2063 static const short offsets[] = {
2064 offsetof(xfs_agf_t, agf_magicnum),
2065 offsetof(xfs_agf_t, agf_versionnum),
2066 offsetof(xfs_agf_t, agf_seqno),
2067 offsetof(xfs_agf_t, agf_length),
2068 offsetof(xfs_agf_t, agf_roots[0]),
2069 offsetof(xfs_agf_t, agf_levels[0]),
2070 offsetof(xfs_agf_t, agf_flfirst),
2071 offsetof(xfs_agf_t, agf_fllast),
2072 offsetof(xfs_agf_t, agf_flcount),
2073 offsetof(xfs_agf_t, agf_freeblks),
2074 offsetof(xfs_agf_t, agf_longest),
2075 sizeof(xfs_agf_t)
2076 };
2077
2078 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2079 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2080}
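/*
 * Illustrative example, assuming the usual xfs_btree_offsets()
 * behaviour of turning the lowest and highest bits set in the mask
 * into one contiguous byte range via the table above: a call like
 *
 *	xfs_alloc_log_agf(tp, bp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
 *
 * logs from offsetof(agf_flfirst) through offsetof(agf_freeblks) - 1,
 * which also covers agf_fllast sitting between the two fields.
 */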
2081
2082/*
2083 * Interface for inode allocation to force the pag data to be initialized.
2084 */
2085int /* error */
2086xfs_alloc_pagf_init(
2087 xfs_mount_t *mp, /* file system mount structure */
2088 xfs_trans_t *tp, /* transaction pointer */
2089 xfs_agnumber_t agno, /* allocation group number */
2090 int flags) /* XFS_ALLOC_FLAGS_... */
2091{
2092 xfs_buf_t *bp;
2093 int error;
2094
2095 if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
2096 return error;
2097 if (bp)
2098 xfs_trans_brelse(tp, bp);
2099 return 0;
2100}
2101
2102/*
2103 * Put the block on the freelist for the allocation group.
2104 */
2105int /* error */
2106xfs_alloc_put_freelist(
2107 xfs_trans_t *tp, /* transaction pointer */
2108 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
2109 xfs_buf_t *agflbp,/* buffer for a.g. free block array */
2110 xfs_agblock_t bno) /* block being freed */
2111{
2112 xfs_agf_t *agf; /* a.g. freespace structure */
2113 xfs_agfl_t *agfl; /* a.g. free block array */
2114 xfs_agblock_t *blockp;/* pointer to array entry */
2115 int error;
2116#ifdef XFS_ALLOC_TRACE
2117 static char fname[] = "xfs_alloc_put_freelist";
2118#endif
2119 xfs_mount_t *mp; /* mount structure */
2120 xfs_perag_t *pag; /* per allocation group data */
2121
2122 agf = XFS_BUF_TO_AGF(agbp);
2123 mp = tp->t_mountp;
2124
2125 if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2126 INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
2127 return error;
2128 agfl = XFS_BUF_TO_AGFL(agflbp);
2129 INT_MOD(agf->agf_fllast, ARCH_CONVERT, 1);
2130 if (INT_GET(agf->agf_fllast, ARCH_CONVERT) == XFS_AGFL_SIZE(mp))
2131 agf->agf_fllast = 0;
2132 pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
2133 INT_MOD(agf->agf_flcount, ARCH_CONVERT, 1);
2134 xfs_trans_agflist_delta(tp, 1);
2135 pag->pagf_flcount++;
2136 ASSERT(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE(mp));
2137 blockp = &agfl->agfl_bno[INT_GET(agf->agf_fllast, ARCH_CONVERT)];
2138 INT_SET(*blockp, ARCH_CONVERT, bno);
2139 TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
2140 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
2141 xfs_trans_log_buf(tp, agflbp,
2142 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
2143 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
2144 sizeof(xfs_agblock_t) - 1));
2145 return 0;
2146}
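/*
 * Descriptive note, not original source text: together with
 * xfs_alloc_get_freelist() above, the AGFL is treated as a circular
 * buffer of XFS_AGFL_SIZE(mp) slots - gets advance agf_flfirst, puts
 * advance agf_fllast, both wrap to 0, and agf_flcount counts the
 * occupied slots between them.
 */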
2147
2148/*
2149 * Read in the allocation group header (free/alloc section).
2150 */
2151int /* error */
2152xfs_alloc_read_agf(
2153 xfs_mount_t *mp, /* mount point structure */
2154 xfs_trans_t *tp, /* transaction pointer */
2155 xfs_agnumber_t agno, /* allocation group number */
2156 int flags, /* XFS_ALLOC_FLAG_... */
2157 xfs_buf_t **bpp) /* buffer for the ag freelist header */
2158{
2159 xfs_agf_t *agf; /* ag freelist header */
2160 int agf_ok; /* set if agf is consistent */
2161 xfs_buf_t *bp; /* return value */
2162 xfs_perag_t *pag; /* per allocation group data */
2163 int error;
2164
2165 ASSERT(agno != NULLAGNUMBER);
2166 error = xfs_trans_read_buf(
2167 mp, tp, mp->m_ddev_targp,
2168 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2169 XFS_FSS_TO_BB(mp, 1),
2170 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
2171 &bp);
2172 if (error)
2173 return error;
2174 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
2175 if (!bp) {
2176 *bpp = NULL;
2177 return 0;
2178 }
2179 /*
2180 * Validate the magic number of the agf block.
2181 */
2182 agf = XFS_BUF_TO_AGF(bp);
2183 agf_ok =
2184 INT_GET(agf->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC &&
2185 XFS_AGF_GOOD_VERSION(
2186 INT_GET(agf->agf_versionnum, ARCH_CONVERT)) &&
2187 INT_GET(agf->agf_freeblks, ARCH_CONVERT) <=
2188 INT_GET(agf->agf_length, ARCH_CONVERT) &&
2189 INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE(mp) &&
2190 INT_GET(agf->agf_fllast, ARCH_CONVERT) < XFS_AGFL_SIZE(mp) &&
2191 INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE(mp);
2192 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2193 XFS_RANDOM_ALLOC_READ_AGF))) {
2194 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2195 XFS_ERRLEVEL_LOW, mp, agf);
2196 xfs_trans_brelse(tp, bp);
2197 return XFS_ERROR(EFSCORRUPTED);
2198 }
2199 pag = &mp->m_perag[agno];
2200 if (!pag->pagf_init) {
2201 pag->pagf_freeblks = INT_GET(agf->agf_freeblks, ARCH_CONVERT);
2202 pag->pagf_flcount = INT_GET(agf->agf_flcount, ARCH_CONVERT);
2203 pag->pagf_longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
2204 pag->pagf_levels[XFS_BTNUM_BNOi] =
2205 INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT);
2206 pag->pagf_levels[XFS_BTNUM_CNTi] =
2207 INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT);
2208 spinlock_init(&pag->pagb_lock, "xfspagb");
2209 pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
2210 sizeof(xfs_perag_busy_t), KM_SLEEP);
2211 pag->pagf_init = 1;
2212 }
2213#ifdef DEBUG
2214 else if (!XFS_FORCED_SHUTDOWN(mp)) {
2215 ASSERT(pag->pagf_freeblks == INT_GET(agf->agf_freeblks, ARCH_CONVERT));
2216 ASSERT(pag->pagf_flcount == INT_GET(agf->agf_flcount, ARCH_CONVERT));
2217 ASSERT(pag->pagf_longest == INT_GET(agf->agf_longest, ARCH_CONVERT));
2218 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
2219 INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT));
2220 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
2221 INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT));
2222 }
2223#endif
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
2225 *bpp = bp;
2226 return 0;
2227}
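/*
 * Descriptive note, not original source text: the first AGF read for
 * an AG seeds the in-core xfs_perag_t with mirror copies of the
 * free-space counters and btree levels and allocates its
 * XFS_PAGB_NUM_SLOTS busy-extent slots; subsequent reads merely
 * cross-check that cache against the disk copy under DEBUG.
 */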
2228
2229/*
2230 * Allocate an extent (variable-size).
2231 * Depending on the allocation type, we either look in a single allocation
2232 * group or loop over the allocation groups to find the result.
2233 */
2234int /* error */
2235xfs_alloc_vextent(
2236 xfs_alloc_arg_t *args) /* allocation argument structure */
2237{
2238 xfs_agblock_t agsize; /* allocation group size */
2239 int error;
2240 int flags; /* XFS_ALLOC_FLAG_... locking flags */
2241#ifdef XFS_ALLOC_TRACE
2242 static char fname[] = "xfs_alloc_vextent";
2243#endif
2244 xfs_extlen_t minleft;/* minimum left value, temp copy */
2245 xfs_mount_t *mp; /* mount structure pointer */
2246 xfs_agnumber_t sagno; /* starting allocation group number */
2247 xfs_alloctype_t type; /* input allocation type */
2248 int bump_rotor = 0;
2249 int no_min = 0;
2250 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
2251
2252 mp = args->mp;
2253 type = args->otype = args->type;
2254 args->agbno = NULLAGBLOCK;
2255 /*
2256 * Just fix this up, for the case where the last a.g. is shorter
2257 * (or there's only one a.g.) and the caller couldn't easily figure
2258 * that out (xfs_bmap_alloc).
2259 */
2260 agsize = mp->m_sb.sb_agblocks;
2261 if (args->maxlen > agsize)
2262 args->maxlen = agsize;
2263 if (args->alignment == 0)
2264 args->alignment = 1;
2265 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
2266 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
2267 ASSERT(args->minlen <= args->maxlen);
2268 ASSERT(args->minlen <= agsize);
2269 ASSERT(args->mod < args->prod);
2270 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
2271 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
2272 args->minlen > args->maxlen || args->minlen > agsize ||
2273 args->mod >= args->prod) {
2274 args->fsbno = NULLFSBLOCK;
2275 TRACE_ALLOC("badargs", args);
2276 return 0;
2277 }
2278 minleft = args->minleft;
2279
2280 switch (type) {
2281 case XFS_ALLOCTYPE_THIS_AG:
2282 case XFS_ALLOCTYPE_NEAR_BNO:
2283 case XFS_ALLOCTYPE_THIS_BNO:
2284 /*
2285 * These three force us into a single a.g.
2286 */
2287 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2288 down_read(&mp->m_peraglock);
2289 args->pag = &mp->m_perag[args->agno];
2290 args->minleft = 0;
2291 error = xfs_alloc_fix_freelist(args, 0);
2292 args->minleft = minleft;
2293 if (error) {
2294 TRACE_ALLOC("nofix", args);
2295 goto error0;
2296 }
2297 if (!args->agbp) {
2298 up_read(&mp->m_peraglock);
2299 TRACE_ALLOC("noagbp", args);
2300 break;
2301 }
2302 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2303 if ((error = xfs_alloc_ag_vextent(args)))
2304 goto error0;
2305 up_read(&mp->m_peraglock);
2306 break;
2307 case XFS_ALLOCTYPE_START_BNO:
2308 /*
2309 * Try near allocation first, then anywhere-in-ag after
2310 * the first a.g. fails.
2311 */
2312 if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
2313 (mp->m_flags & XFS_MOUNT_32BITINODES)) {
2314 args->fsbno = XFS_AGB_TO_FSB(mp,
2315 ((mp->m_agfrotor / rotorstep) %
2316 mp->m_sb.sb_agcount), 0);
2317 bump_rotor = 1;
2318 }
2319 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2320 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2321 /* FALLTHROUGH */
2322 case XFS_ALLOCTYPE_ANY_AG:
2323 case XFS_ALLOCTYPE_START_AG:
2324 case XFS_ALLOCTYPE_FIRST_AG:
2325 /*
2326 * Rotate through the allocation groups looking for a winner.
2327 */
2328 if (type == XFS_ALLOCTYPE_ANY_AG) {
2329 /*
2330 * Start with the last place we left off.
2331 */
2332 args->agno = sagno = (mp->m_agfrotor / rotorstep) %
2333 mp->m_sb.sb_agcount;
2334 args->type = XFS_ALLOCTYPE_THIS_AG;
2335 flags = XFS_ALLOC_FLAG_TRYLOCK;
2336 } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
2337 /*
2338 * Start with allocation group given by bno.
2339 */
2340 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2341 args->type = XFS_ALLOCTYPE_THIS_AG;
2342 sagno = 0;
2343 flags = 0;
2344 } else {
2345 if (type == XFS_ALLOCTYPE_START_AG)
2346 args->type = XFS_ALLOCTYPE_THIS_AG;
2347 /*
2348 * Start with the given allocation group.
2349 */
2350 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2351 flags = XFS_ALLOC_FLAG_TRYLOCK;
2352 }
2353 /*
2354 * Loop over allocation groups twice; first time with
2355 * trylock set, second time without.
2356 */
2357 down_read(&mp->m_peraglock);
2358 for (;;) {
2359 args->pag = &mp->m_perag[args->agno];
2360 if (no_min) args->minleft = 0;
2361 error = xfs_alloc_fix_freelist(args, flags);
2362 args->minleft = minleft;
2363 if (error) {
2364 TRACE_ALLOC("nofix", args);
2365 goto error0;
2366 }
2367 /*
2368 * If we get a buffer back then the allocation will fly.
2369 */
2370 if (args->agbp) {
2371 if ((error = xfs_alloc_ag_vextent(args)))
2372 goto error0;
2373 break;
2374 }
2375 TRACE_ALLOC("loopfailed", args);
2376 /*
2377 * Didn't work, figure out the next iteration.
2378 */
2379 if (args->agno == sagno &&
2380 type == XFS_ALLOCTYPE_START_BNO)
2381 args->type = XFS_ALLOCTYPE_THIS_AG;
2382 if (++(args->agno) == mp->m_sb.sb_agcount)
2383 args->agno = 0;
2384 /*
2385 * Reached the starting a.g., must either be done
2386 * or switch to non-trylock mode.
2387 */
2388 if (args->agno == sagno) {
2389 if (no_min == 1) {
2390 args->agbno = NULLAGBLOCK;
2391 TRACE_ALLOC("allfailed", args);
2392 break;
2393 }
2394 if (flags == 0) {
2395 no_min = 1;
2396 } else {
2397 flags = 0;
2398 if (type == XFS_ALLOCTYPE_START_BNO) {
2399 args->agbno = XFS_FSB_TO_AGBNO(mp,
2400 args->fsbno);
2401 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2402 }
2403 }
2404 }
2405 }
2406 up_read(&mp->m_peraglock);
2407 if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
2408 if (args->agno == sagno)
2409 mp->m_agfrotor = (mp->m_agfrotor + 1) %
2410 (mp->m_sb.sb_agcount * rotorstep);
2411 else
2412 mp->m_agfrotor = (args->agno * rotorstep + 1) %
2413 (mp->m_sb.sb_agcount * rotorstep);
2414 }
2415 break;
2416 default:
2417 ASSERT(0);
2418 /* NOTREACHED */
2419 }
2420 if (args->agbno == NULLAGBLOCK)
2421 args->fsbno = NULLFSBLOCK;
2422 else {
2423 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
2424#ifdef DEBUG
2425 ASSERT(args->len >= args->minlen);
2426 ASSERT(args->len <= args->maxlen);
2427 ASSERT(args->agbno % args->alignment == 0);
2428 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
2429 args->len);
2430#endif
2431 }
2432 return 0;
2433error0:
2434 up_read(&mp->m_peraglock);
2435 return error;
2436}
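/*
 * Worked rotor example with made-up numbers (sb_agcount and
 * xfs_rotorstep below are illustrative): with sb_agcount = 4 and
 * xfs_rotorstep = 8, the starting AG is (m_agfrotor / 8) % 4.  A
 * search ending in the starting AG advances m_agfrotor by one, so the
 * preferred AG steps forward only every 8 such allocations; a search
 * ending in some other AG a sets m_agfrotor to a * 8 + 1, making a
 * the next starting AG.
 */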
2437
2438/*
2439 * Free an extent.
2440 * Just break up the extent address and hand off to xfs_free_ag_extent
2441 * after fixing up the freelist.
2442 */
2443int /* error */
2444xfs_free_extent(
2445 xfs_trans_t *tp, /* transaction pointer */
2446 xfs_fsblock_t bno, /* starting block number of extent */
2447 xfs_extlen_t len) /* length of extent */
2448{
2449#ifdef DEBUG
2450 xfs_agf_t *agf; /* a.g. freespace header */
2451#endif
2452 xfs_alloc_arg_t args; /* allocation argument structure */
2453 int error;
2454
2455 ASSERT(len != 0);
2456 args.tp = tp;
2457 args.mp = tp->t_mountp;
2458 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2459 ASSERT(args.agno < args.mp->m_sb.sb_agcount);
2460 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2461 args.alignment = 1;
2462 args.minlen = args.minleft = args.minalignslop = 0;
2463 down_read(&args.mp->m_peraglock);
2464 args.pag = &args.mp->m_perag[args.agno];
2465 if ((error = xfs_alloc_fix_freelist(&args, 0)))
2466 goto error0;
2467#ifdef DEBUG
2468 ASSERT(args.agbp != NULL);
2469 agf = XFS_BUF_TO_AGF(args.agbp);
2470 ASSERT(args.agbno + len <= INT_GET(agf->agf_length, ARCH_CONVERT));
2471#endif
2472 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno,
2473 len, 0);
2474error0:
2475 up_read(&args.mp->m_peraglock);
2476 return error;
2477}
2478
2479
2480/*
2481 * AG Busy list management
2482 * The busy list contains block ranges that have been freed but whose
2483 * transactions have not yet hit disk. If any block listed in a busy
2484 * list is reused, the transaction that freed it must be forced to disk
2485 * before continuing to use the block.
2486 *
2487 * xfs_alloc_mark_busy - add to the per-ag busy list
2488 * xfs_alloc_clear_busy - remove an item from the per-ag busy list
2489 */
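/*
 * Interval sketch for xfs_alloc_search_busy() below (paraphrase, not
 * original source text): the extent [bno, bno + len - 1] misses a busy
 * extent [busy_start, busy_start + busy_length - 1] exactly when
 *
 *	bno > busy_start + busy_length - 1 ||
 *	bno + len - 1 < busy_start
 *
 * so anything else counts as an overlap and forces the log.
 */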
2490void
2491xfs_alloc_mark_busy(xfs_trans_t *tp,
2492 xfs_agnumber_t agno,
2493 xfs_agblock_t bno,
2494 xfs_extlen_t len)
2495{
2496 xfs_mount_t *mp;
2497 xfs_perag_busy_t *bsy;
2498 int n;
2499 SPLDECL(s);
2500
2501 mp = tp->t_mountp;
2502 s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
2503
2504 /* search pagb_list for an open slot */
2505 for (bsy = mp->m_perag[agno].pagb_list, n = 0;
2506 n < XFS_PAGB_NUM_SLOTS;
2507 bsy++, n++) {
2508 if (bsy->busy_tp == NULL) {
2509 break;
2510 }
2511 }
2512
2513 if (n < XFS_PAGB_NUM_SLOTS) {
2514 bsy = &mp->m_perag[agno].pagb_list[n];
2515 mp->m_perag[agno].pagb_count++;
2516 TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
2517 bsy->busy_start = bno;
2518 bsy->busy_length = len;
2519 bsy->busy_tp = tp;
2520 xfs_trans_add_busy(tp, agno, n);
2521 } else {
2522 TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
2523 /*
2524 * The busy list is full! Since it is now not possible to
2525		 * track the freed block, make this a synchronous transaction
2526		 * to ensure that the block is not reused before this
2527 * transaction commits.
2528 */
2529 xfs_trans_set_sync(tp);
2530 }
2531
2532 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2533}
2534
2535void
2536xfs_alloc_clear_busy(xfs_trans_t *tp,
2537 xfs_agnumber_t agno,
2538 int idx)
2539{
2540 xfs_mount_t *mp;
2541 xfs_perag_busy_t *list;
2542 SPLDECL(s);
2543
2544 mp = tp->t_mountp;
2545
2546 s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
2547 list = mp->m_perag[agno].pagb_list;
2548
2549 ASSERT(idx < XFS_PAGB_NUM_SLOTS);
2550 if (list[idx].busy_tp == tp) {
2551 TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
2552 list[idx].busy_tp = NULL;
2553 mp->m_perag[agno].pagb_count--;
2554 } else {
2555 TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
2556 }
2557
2558 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2559}
2560
2561
2562/*
2563 * Returns the busy-list slot index of a busy extent overlapping (agno, bno, len), or -1 if none.
2564 */
2565int
2566xfs_alloc_search_busy(xfs_trans_t *tp,
2567 xfs_agnumber_t agno,
2568 xfs_agblock_t bno,
2569 xfs_extlen_t len)
2570{
2571 xfs_mount_t *mp;
2572 xfs_perag_busy_t *bsy;
2573 int n;
2574 xfs_agblock_t uend, bend;
2575 xfs_lsn_t lsn;
2576 int cnt;
2577 SPLDECL(s);
2578
2579 mp = tp->t_mountp;
2580
2581 s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
2582 cnt = mp->m_perag[agno].pagb_count;
2583
2584 uend = bno + len - 1;
2585
2586	/* search pagb_list for a busy extent overlapping this range, skipping open slots */
2587 for (bsy = mp->m_perag[agno].pagb_list, n = 0;
2588 cnt; bsy++, n++) {
2589
2590 /*
2591		 * does (bno, len) overlap (busy_start, busy_length)?
2592 */
2593 if (bsy->busy_tp != NULL) {
2594 bend = bsy->busy_start + bsy->busy_length - 1;
2595 if ((bno > bend) ||
2596 (uend < bsy->busy_start)) {
2597 cnt--;
2598 } else {
2599 TRACE_BUSYSEARCH("xfs_alloc_search_busy",
2600 "found1", agno, bno, len, n,
2601 tp);
2602 break;
2603 }
2604 }
2605 }
2606
2607 /*
2608 * If a block was found, force the log through the LSN of the
2609	 * transaction that freed the block.
2610 */
2611 if (cnt) {
2612 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
2613 lsn = bsy->busy_tp->t_commit_lsn;
2614 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2615 xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
2616 } else {
2617 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
2618 n = -1;
2619 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2620 }
2621
2622 return n;
2623}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
new file mode 100644
index 000000000000..72329c86351c
--- /dev/null
+++ b/fs/xfs/xfs_alloc.h
@@ -0,0 +1,203 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ALLOC_H__
33#define __XFS_ALLOC_H__
34
35struct xfs_buf;
36struct xfs_mount;
37struct xfs_perag;
38struct xfs_trans;
39
40/*
41 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
42 */
43typedef enum xfs_alloctype
44{
45 XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */
46 XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */
47 XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */
48 XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */
49 XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */
50 XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */
51 XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */
52} xfs_alloctype_t;
53
54/*
55 * Flags for xfs_alloc_fix_freelist.
56 */
57#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
58
59/*
60 * Argument structure for xfs_alloc routines.
61 * This is turned into a structure to avoid having 20 arguments passed
62 * down several levels of the stack.
63 */
64typedef struct xfs_alloc_arg {
65 struct xfs_trans *tp; /* transaction pointer */
66 struct xfs_mount *mp; /* file system mount point */
67 struct xfs_buf *agbp; /* buffer for a.g. freelist header */
68 struct xfs_perag *pag; /* per-ag struct for this agno */
69 xfs_fsblock_t fsbno; /* file system block number */
70 xfs_agnumber_t agno; /* allocation group number */
71 xfs_agblock_t agbno; /* allocation group-relative block # */
72 xfs_extlen_t minlen; /* minimum size of extent */
73 xfs_extlen_t maxlen; /* maximum size of extent */
74 xfs_extlen_t mod; /* mod value for extent size */
75 xfs_extlen_t prod; /* prod value for extent size */
76 xfs_extlen_t minleft; /* min blocks must be left after us */
77 xfs_extlen_t total; /* total blocks needed in xaction */
78 xfs_extlen_t alignment; /* align answer to multiple of this */
79 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
80 xfs_extlen_t len; /* output: actual size of extent */
81 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
82 xfs_alloctype_t otype; /* original allocation type */
83 char wasdel; /* set if allocation was prev delayed */
84 char wasfromfl; /* set if allocation is from freelist */
85	char		isfl;		/* set if these are freelist blocks (no sb accounting) */
86 char userdata; /* set if this is user data */
87} xfs_alloc_arg_t;
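/*
 * Hypothetical usage sketch (the field values are illustrative, not a
 * required recipe): set tp, mp, fsbno (target block), minlen/maxlen,
 * prod = 1, mod = 0, alignment = 1, total, and a type such as
 * XFS_ALLOCTYPE_START_BNO, then call xfs_alloc_vextent(); on return
 * fsbno is NULLFSBLOCK on failure, otherwise fsbno/len describe the
 * extent actually allocated.
 */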
88
89/*
90 * Defines for userdata
91 */
92#define XFS_ALLOC_USERDATA		1	/* allocation is for user data */
93#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
94
95
96#ifdef __KERNEL__
97
98#if defined(XFS_ALLOC_TRACE)
99/*
100 * Allocation tracing buffer size.
101 */
102#define XFS_ALLOC_TRACE_SIZE 4096
103extern ktrace_t *xfs_alloc_trace_buf;
104
105/*
106 * Types for alloc tracing.
107 */
108#define XFS_ALLOC_KTRACE_ALLOC 1
109#define XFS_ALLOC_KTRACE_FREE 2
110#define XFS_ALLOC_KTRACE_MODAGF 3
111#define XFS_ALLOC_KTRACE_BUSY 4
112#define XFS_ALLOC_KTRACE_UNBUSY 5
113#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
114#endif
115
116/*
117 * Compute and fill in value of m_ag_maxlevels.
118 */
119void
120xfs_alloc_compute_maxlevels(
121 struct xfs_mount *mp); /* file system mount structure */
122
123/*
124 * Get a block from the freelist.
125 * Returns with the buffer for the block gotten.
126 */
127int /* error */
128xfs_alloc_get_freelist(
129 struct xfs_trans *tp, /* transaction pointer */
130 struct xfs_buf *agbp, /* buffer containing the agf structure */
131 xfs_agblock_t *bnop); /* block address retrieved from freelist */
132
133/*
134 * Log the given fields from the agf structure.
135 */
136void
137xfs_alloc_log_agf(
138 struct xfs_trans *tp, /* transaction pointer */
139 struct xfs_buf *bp, /* buffer for a.g. freelist header */
140 int fields);/* mask of fields to be logged (XFS_AGF_...) */
141
142/*
143 * Interface for inode allocation to force the pag data to be initialized.
144 */
145int /* error */
146xfs_alloc_pagf_init(
147 struct xfs_mount *mp, /* file system mount structure */
148 struct xfs_trans *tp, /* transaction pointer */
149 xfs_agnumber_t agno, /* allocation group number */
150 int flags); /* XFS_ALLOC_FLAGS_... */
151
152/*
153 * Put the block on the freelist for the allocation group.
154 */
155int /* error */
156xfs_alloc_put_freelist(
157 struct xfs_trans *tp, /* transaction pointer */
158 struct xfs_buf *agbp, /* buffer for a.g. freelist header */
159 struct xfs_buf *agflbp,/* buffer for a.g. free block array */
160 xfs_agblock_t bno); /* block being freed */
161
162/*
163 * Read in the allocation group header (free/alloc section).
164 */
165int /* error */
166xfs_alloc_read_agf(
167 struct xfs_mount *mp, /* mount point structure */
168 struct xfs_trans *tp, /* transaction pointer */
169 xfs_agnumber_t agno, /* allocation group number */
170 int flags, /* XFS_ALLOC_FLAG_... */
171 struct xfs_buf **bpp); /* buffer for the ag freelist header */
172
173/*
174 * Allocate an extent (variable-size).
175 */
176int /* error */
177xfs_alloc_vextent(
178 xfs_alloc_arg_t *args); /* allocation argument structure */
179
180/*
181 * Free an extent.
182 */
183int /* error */
184xfs_free_extent(
185 struct xfs_trans *tp, /* transaction pointer */
186 xfs_fsblock_t bno, /* starting block number of extent */
187 xfs_extlen_t len); /* length of extent */
188
189void
190xfs_alloc_mark_busy(xfs_trans_t *tp,
191 xfs_agnumber_t agno,
192 xfs_agblock_t bno,
193 xfs_extlen_t len);
194
195void
196xfs_alloc_clear_busy(xfs_trans_t *tp,
197 xfs_agnumber_t ag,
198 int idx);
199
200
201#endif /* __KERNEL__ */
202
203#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
new file mode 100644
index 000000000000..e0355a12d946
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -0,0 +1,2204 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Free space allocation for XFS.
35 */
36
37#include "xfs.h"
38#include "xfs_macros.h"
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_log.h"
42#include "xfs_trans.h"
43#include "xfs_sb.h"
44#include "xfs_ag.h"
45#include "xfs_dir.h"
46#include "xfs_dmapi.h"
47#include "xfs_mount.h"
48#include "xfs_alloc_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_bmap_btree.h"
51#include "xfs_btree.h"
52#include "xfs_ialloc.h"
53#include "xfs_alloc.h"
54#include "xfs_error.h"
55
56/*
57 * Prototypes for internal functions.
58 */
59
60STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
61STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
62STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
63STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
64STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
65STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
66STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
67STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
68 xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
69STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
70
71/*
72 * Internal functions.
73 */
74
75/*
76 * Single level of the xfs_alloc_delete record deletion routine.
77 * Delete record pointed to by cur/level.
78 * Remove the record from its block then rebalance the tree.
79 * Sets *stat to 0 for failure, 1 for done, 2 to go on to the next level.
80 */
81STATIC int /* error */
82xfs_alloc_delrec(
83 xfs_btree_cur_t *cur, /* btree cursor */
84 int level, /* level removing record from */
85 int *stat) /* fail/done/go-on */
86{
87 xfs_agf_t *agf; /* allocation group freelist header */
88 xfs_alloc_block_t *block; /* btree block record/key lives in */
89 xfs_agblock_t bno; /* btree block number */
90 xfs_buf_t *bp; /* buffer for block */
91 int error; /* error return value */
92 int i; /* loop index */
93 xfs_alloc_key_t key; /* kp points here if block is level 0 */
94 xfs_agblock_t lbno; /* left block's block number */
95 xfs_buf_t *lbp; /* left block's buffer pointer */
96 xfs_alloc_block_t *left; /* left btree block */
97 xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
98 xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
99 int lrecs=0; /* number of records in left block */
100 xfs_alloc_rec_t *lrp; /* left block record pointer */
101 xfs_mount_t *mp; /* mount structure */
102 int ptr; /* index in btree block for this rec */
103 xfs_agblock_t rbno; /* right block's block number */
104 xfs_buf_t *rbp; /* right block's buffer pointer */
105 xfs_alloc_block_t *right; /* right btree block */
106 xfs_alloc_key_t *rkp; /* right block key pointer */
107 xfs_alloc_ptr_t *rpp; /* right block address pointer */
108 int rrecs=0; /* number of records in right block */
109 xfs_alloc_rec_t *rrp; /* right block record pointer */
110 xfs_btree_cur_t *tcur; /* temporary btree cursor */
111
112 /*
113 * Get the index of the entry being deleted, check for nothing there.
114 */
115 ptr = cur->bc_ptrs[level];
116 if (ptr == 0) {
117 *stat = 0;
118 return 0;
119 }
120 /*
121 * Get the buffer & block containing the record or key/ptr.
122 */
123 bp = cur->bc_bufs[level];
124 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
125#ifdef DEBUG
126 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
127 return error;
128#endif
129 /*
130 * Fail if we're off the end of the block.
131 */
132 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
133 *stat = 0;
134 return 0;
135 }
136 XFS_STATS_INC(xs_abt_delrec);
137 /*
138 * It's a nonleaf. Excise the key and ptr being deleted, by
139 * sliding the entries past them down one.
140 * Log the changed areas of the block.
141 */
142 if (level > 0) {
143 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
144 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
145#ifdef DEBUG
146 for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
147 if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
148 return error;
149 }
150#endif
151 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
152 memmove(&lkp[ptr - 1], &lkp[ptr],
153 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lkp)); /* INT_: mem copy */
154 memmove(&lpp[ptr - 1], &lpp[ptr],
155 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lpp)); /* INT_: mem copy */
156 xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
157 xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
158 }
159 }
160 /*
161 * It's a leaf. Excise the record being deleted, by sliding the
162 * entries past it down one. Log the changed areas of the block.
163 */
164 else {
165 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
166 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
167 memmove(&lrp[ptr - 1], &lrp[ptr],
168 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lrp));
169 xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
170 }
171 /*
172 * If it's the first record in the block, we'll need a key
173 * structure to pass up to the next level (updkey).
174 */
175 if (ptr == 1) {
176 key.ar_startblock = lrp->ar_startblock; /* INT_: direct copy */
177 key.ar_blockcount = lrp->ar_blockcount; /* INT_: direct copy */
178 lkp = &key;
179 }
180 }
181 /*
182 * Decrement and log the number of entries in the block.
183 */
184 INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1);
185 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
186 /*
187 * See if the longest free extent in the allocation group was
188 * changed by this operation. True if it's the by-size btree, and
189 * this is the leaf level, and there is no right sibling block,
190 * and this was the last record.
191 */
192 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
193 mp = cur->bc_mp;
194
195 if (level == 0 &&
196 cur->bc_btnum == XFS_BTNUM_CNT &&
197 INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
198 ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
199 ASSERT(ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT) + 1);
200 /*
201 * There are still records in the block. Grab the size
202 * from the last one.
203 */
204 if (INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
205 rrp = XFS_ALLOC_REC_ADDR(block, INT_GET(block->bb_numrecs, ARCH_CONVERT), cur);
206 INT_COPY(agf->agf_longest, rrp->ar_blockcount, ARCH_CONVERT);
207 }
208 /*
209 * No free extents left.
210 */
211 else
212 agf->agf_longest = 0;
213 mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest =
214 INT_GET(agf->agf_longest, ARCH_CONVERT);
215 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
216 XFS_AGF_LONGEST);
217 }
218 /*
219 * Is this the root level? If so, we're almost done.
220 */
221 if (level == cur->bc_nlevels - 1) {
222 /*
223 * If this is the root level,
224 * and there's only one entry left,
225 * and it's NOT the leaf level,
226 * then we can get rid of this level.
227 */
228 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) {
229 /*
230 * lpp is still set to the first pointer in the block.
231 * Make it the new root of the btree.
232 */
233 bno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
234 INT_COPY(agf->agf_roots[cur->bc_btnum], *lpp, ARCH_CONVERT);
235 INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, -1);
236 mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_levels[cur->bc_btnum]--;
237 /*
238 * Put this buffer/block on the ag's freelist.
239 */
240 if ((error = xfs_alloc_put_freelist(cur->bc_tp,
241 cur->bc_private.a.agbp, NULL, bno)))
242 return error;
243 /*
244 * Since blocks move to the free list without the
245 * coordination used in xfs_bmap_finish, we can't allow
246			 * the block to be available for reallocation and
247 * non-transaction writing (user data) until we know
248 * that the transaction that moved it to the free list
249 * is permanently on disk. We track the blocks by
250 * declaring these blocks as "busy"; the busy list is
251 * maintained on a per-ag basis and each transaction
252 * records which entries should be removed when the
253 * iclog commits to disk. If a busy block is
254 * allocated, the iclog is pushed up to the LSN
255 * that freed the block.
256 */
257 xfs_alloc_mark_busy(cur->bc_tp,
258 INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
259
260 xfs_trans_agbtree_delta(cur->bc_tp, -1);
261 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
262 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
263 /*
264 * Update the cursor so there's one fewer level.
265 */
266 xfs_btree_setbuf(cur, level, NULL);
267 cur->bc_nlevels--;
268 } else if (level > 0 &&
269 (error = xfs_alloc_decrement(cur, level, &i)))
270 return error;
271 *stat = 1;
272 return 0;
273 }
274 /*
275 * If we deleted the leftmost entry in the block, update the
276 * key values above us in the tree.
277 */
278 if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
279 return error;
280 /*
281 * If the number of records remaining in the block is at least
282 * the minimum, we're done.
283 */
284 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
285 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
286 return error;
287 *stat = 1;
288 return 0;
289 }
290 /*
291 * Otherwise, we have to move some records around to keep the
292 * tree balanced. Look at the left and right sibling blocks to
293 * see if we can re-balance by moving only one record.
294 */
295 rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
296 lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
297 bno = NULLAGBLOCK;
298 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
299 /*
300 * Duplicate the cursor so our btree manipulations here won't
301 * disrupt the next level up.
302 */
303 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
304 return error;
305 /*
306 * If there's a right sibling, see if it's ok to shift an entry
307 * out of it.
308 */
309 if (rbno != NULLAGBLOCK) {
310 /*
311 * Move the temp cursor to the last entry in the next block.
312 * Actually any entry but the first would suffice.
313 */
314 i = xfs_btree_lastrec(tcur, level);
315 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
316 if ((error = xfs_alloc_increment(tcur, level, &i)))
317 goto error0;
318 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
319 i = xfs_btree_lastrec(tcur, level);
320 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
321 /*
322 * Grab a pointer to the block.
323 */
324 rbp = tcur->bc_bufs[level];
325 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
326#ifdef DEBUG
327 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
328 goto error0;
329#endif
330 /*
331 * Grab the current block number, for future use.
332 */
333 bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
334 /*
335 * If right block is full enough so that removing one entry
336 * won't make it too empty, and left-shifting an entry out
337 * of right to us works, we're done.
338 */
339 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
340 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
341 if ((error = xfs_alloc_lshift(tcur, level, &i)))
342 goto error0;
343 if (i) {
344 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
345 XFS_ALLOC_BLOCK_MINRECS(level, cur));
346 xfs_btree_del_cursor(tcur,
347 XFS_BTREE_NOERROR);
348 if (level > 0 &&
349 (error = xfs_alloc_decrement(cur, level,
350 &i)))
351 return error;
352 *stat = 1;
353 return 0;
354 }
355 }
356 /*
357 * Otherwise, grab the number of records in right for
358 * future reference, and fix up the temp cursor to point
359 * to our block again (last record).
360 */
361 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
362 if (lbno != NULLAGBLOCK) {
363 i = xfs_btree_firstrec(tcur, level);
364 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
365 if ((error = xfs_alloc_decrement(tcur, level, &i)))
366 goto error0;
367 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
368 }
369 }
370 /*
371 * If there's a left sibling, see if it's ok to shift an entry
372 * out of it.
373 */
374 if (lbno != NULLAGBLOCK) {
375 /*
376 * Move the temp cursor to the first entry in the
377 * previous block.
378 */
379 i = xfs_btree_firstrec(tcur, level);
380 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
381 if ((error = xfs_alloc_decrement(tcur, level, &i)))
382 goto error0;
383 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
384 xfs_btree_firstrec(tcur, level);
385 /*
386 * Grab a pointer to the block.
387 */
388 lbp = tcur->bc_bufs[level];
389 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
390#ifdef DEBUG
391 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
392 goto error0;
393#endif
394 /*
395 * Grab the current block number, for future use.
396 */
397 bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
398 /*
399 * If left block is full enough so that removing one entry
400 * won't make it too empty, and right-shifting an entry out
401 * of left to us works, we're done.
402 */
403 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
404 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
405 if ((error = xfs_alloc_rshift(tcur, level, &i)))
406 goto error0;
407 if (i) {
408 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
409 XFS_ALLOC_BLOCK_MINRECS(level, cur));
410 xfs_btree_del_cursor(tcur,
411 XFS_BTREE_NOERROR);
412 if (level == 0)
413 cur->bc_ptrs[0]++;
414 *stat = 1;
415 return 0;
416 }
417 }
418 /*
419		 * Otherwise, grab the number of records in left for
420 * future reference.
421 */
422 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
423 }
424 /*
425 * Delete the temp cursor, we're done with it.
426 */
427 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
428 /*
429 * If here, we need to do a join to keep the tree balanced.
430 */
431 ASSERT(bno != NULLAGBLOCK);
432 /*
433 * See if we can join with the left neighbor block.
434 */
435 if (lbno != NULLAGBLOCK &&
436 lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
437 /*
438 * Set "right" to be the starting block,
439 * "left" to be the left neighbor.
440 */
441 rbno = bno;
442 right = block;
443 rbp = bp;
444 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
445 cur->bc_private.a.agno, lbno, 0, &lbp,
446 XFS_ALLOC_BTREE_REF)))
447 return error;
448 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
449 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
450 return error;
451 }
452 /*
453 * If that won't work, see if we can join with the right neighbor block.
454 */
455 else if (rbno != NULLAGBLOCK &&
456 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
457 XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
458 /*
459 * Set "left" to be the starting block,
460 * "right" to be the right neighbor.
461 */
462 lbno = bno;
463 left = block;
464 lbp = bp;
465 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
466 cur->bc_private.a.agno, rbno, 0, &rbp,
467 XFS_ALLOC_BTREE_REF)))
468 return error;
469 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
470 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
471 return error;
472 }
473 /*
474 * Otherwise, we can't fix the imbalance.
475 * Just return. This is probably a logic error, but it's not fatal.
476 */
477 else {
478 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
479 return error;
480 *stat = 1;
481 return 0;
482 }
483 /*
484 * We're now going to join "left" and "right" by moving all the stuff
485 * in "right" to "left" and deleting "right".
486 */
487 if (level > 0) {
488 /*
489 * It's a non-leaf. Move keys and pointers.
490 */
491 lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
492 lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
493 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
494 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
495#ifdef DEBUG
496 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
497 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
498 return error;
499 }
500#endif
501 memcpy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp)); /* INT_: structure copy */
502 memcpy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp)); /* INT_: structure copy */
503 xfs_alloc_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
504 INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
505 xfs_alloc_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
506 INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
507 } else {
508 /*
509 * It's a leaf. Move records.
510 */
511 lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
512 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
513 memcpy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp));
514 xfs_alloc_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
515 INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
516 }
517 /*
518 * If we joined with the left neighbor, set the buffer in the
519 * cursor to the left block, and fix up the index.
520 */
521 if (bp != lbp) {
522 xfs_btree_setbuf(cur, level, lbp);
523 cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT);
524 }
525 /*
526 * If we joined with the right neighbor and there's a level above
527 * us, increment the cursor at that level.
528 */
529 else if (level + 1 < cur->bc_nlevels &&
530 (error = xfs_alloc_increment(cur, level + 1, &i)))
531 return error;
532 /*
533 * Fix up the number of records in the surviving block.
534 */
535 INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT));
536 /*
537 * Fix up the right block pointer in the surviving block, and log it.
538 */
539 left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
540 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
541 /*
542 * If there is a right sibling now, make it point to the
543 * remaining block.
544 */
545 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
546 xfs_alloc_block_t *rrblock;
547 xfs_buf_t *rrbp;
548
549 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
550 cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
551 &rrbp, XFS_ALLOC_BTREE_REF)))
552 return error;
553 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
554 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
555 return error;
556 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
557 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
558 }
559 /*
560 * Free the deleting block by putting it on the freelist.
561 */
562 if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp,
563 NULL, rbno)))
564 return error;
565 /*
566 * Since blocks move to the free list without the coordination
567	 * used in xfs_bmap_finish, we can't allow the block to be available
568 * for reallocation and non-transaction writing (user data)
569 * until we know that the transaction that moved it to the free
570 * list is permanently on disk. We track the blocks by declaring
571 * these blocks as "busy"; the busy list is maintained on a
572 * per-ag basis and each transaction records which entries
573 * should be removed when the iclog commits to disk. If a
574 * busy block is allocated, the iclog is pushed up to the
575 * LSN that freed the block.
576 */
577 xfs_alloc_mark_busy(cur->bc_tp,
578 INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
579
580 xfs_trans_agbtree_delta(cur->bc_tp, -1);
581 /*
582 * Adjust the current level's cursor so that we're left referring
583 * to the right node, after we're done.
584 * If this leaves the ptr value 0 our caller will fix it up.
585 */
586 if (level > 0)
587 cur->bc_ptrs[level]--;
588 /*
589 * Return value means the next level up has something to do.
590 */
591 *stat = 2;
592 return 0;
593
594error0:
595 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
596 return error;
597}
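/*
 * Rebalancing summary, paraphrasing the logic above (not original
 * source text): once a block falls below XFS_ALLOC_BLOCK_MINRECS the
 * code tries, in order, to (1) borrow a record from the right sibling,
 * (2) borrow one from the left sibling, and only if neither sibling
 * can spare a record, (3) join with whichever sibling the records fit
 * into, freeing the emptied block back to the AG freelist and marking
 * it busy.
 */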
598
599/*
600 * Insert one record/level. Return information to the caller
601 * allowing the next level up to proceed if necessary.
602 */
603STATIC int /* error */
604xfs_alloc_insrec(
605 xfs_btree_cur_t *cur, /* btree cursor */
606 int level, /* level to insert record at */
607 xfs_agblock_t *bnop, /* i/o: block number inserted */
608 xfs_alloc_rec_t *recp, /* i/o: record data inserted */
609 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
610 int *stat) /* output: success/failure */
611{
612 xfs_agf_t *agf; /* allocation group freelist header */
613 xfs_alloc_block_t *block; /* btree block record/key lives in */
614 xfs_buf_t *bp; /* buffer for block */
615 int error; /* error return value */
616 int i; /* loop index */
617 xfs_alloc_key_t key; /* key value being inserted */
618 xfs_alloc_key_t *kp; /* pointer to btree keys */
619 xfs_agblock_t nbno; /* block number of allocated block */
620 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
621 xfs_alloc_key_t nkey; /* new key value, from split */
622 xfs_alloc_rec_t nrec; /* new record value, for caller */
623 int optr; /* old ptr value */
624 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
625 int ptr; /* index in btree block for this rec */
626 xfs_alloc_rec_t *rp; /* pointer to btree records */
627
628 ASSERT(INT_GET(recp->ar_blockcount, ARCH_CONVERT) > 0);
629 /*
630 * If we made it to the root level, allocate a new root block
631 * and we're done.
632 */
633 if (level >= cur->bc_nlevels) {
634 XFS_STATS_INC(xs_abt_insrec);
635 if ((error = xfs_alloc_newroot(cur, &i)))
636 return error;
637 *bnop = NULLAGBLOCK;
638 *stat = i;
639 return 0;
640 }
641 /*
642 * Make a key out of the record data to be inserted, and save it.
643 */
644 key.ar_startblock = recp->ar_startblock; /* INT_: direct copy */
645 key.ar_blockcount = recp->ar_blockcount; /* INT_: direct copy */
646 optr = ptr = cur->bc_ptrs[level];
647 /*
648 * If we're off the left edge, return failure.
649 */
650 if (ptr == 0) {
651 *stat = 0;
652 return 0;
653 }
654 XFS_STATS_INC(xs_abt_insrec);
655 /*
656 * Get pointers to the btree buffer and block.
657 */
658 bp = cur->bc_bufs[level];
659 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
660#ifdef DEBUG
661 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
662 return error;
663 /*
664 * Check that the new entry is being inserted in the right place.
665 */
666 if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
667 if (level == 0) {
668 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
669 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
670 } else {
671 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
672 xfs_btree_check_key(cur->bc_btnum, &key, kp);
673 }
674 }
675#endif
676 nbno = NULLAGBLOCK;
677 ncur = (xfs_btree_cur_t *)0;
678 /*
679 * If the block is full, we can't insert the new entry until we
680 * make the block un-full.
681 */
682 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
683 /*
684 * First, try shifting an entry to the right neighbor.
685 */
686 if ((error = xfs_alloc_rshift(cur, level, &i)))
687 return error;
688 if (i) {
689 /* nothing */
690 }
691 /*
692 * Next, try shifting an entry to the left neighbor.
693 */
694 else {
695 if ((error = xfs_alloc_lshift(cur, level, &i)))
696 return error;
697 if (i)
698 optr = ptr = cur->bc_ptrs[level];
699 else {
700 /*
701 * Next, try splitting the current block in
702 * half. If this works we have to re-set our
703 * variables because we could be in a
704 * different block now.
705 */
706 if ((error = xfs_alloc_split(cur, level, &nbno,
707 &nkey, &ncur, &i)))
708 return error;
709 if (i) {
710 bp = cur->bc_bufs[level];
711 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
712#ifdef DEBUG
713 if ((error =
714 xfs_btree_check_sblock(cur,
715 block, level, bp)))
716 return error;
717#endif
718 ptr = cur->bc_ptrs[level];
719 nrec.ar_startblock = nkey.ar_startblock; /* INT_: direct copy */
720 nrec.ar_blockcount = nkey.ar_blockcount; /* INT_: direct copy */
721 }
722 /*
723 * Otherwise the insert fails.
724 */
725 else {
726 *stat = 0;
727 return 0;
728 }
729 }
730 }
731 }
732 /*
733 * At this point we know there's room for our new entry in the block
734 * we're pointing at.
735 */
736 if (level > 0) {
737 /*
738 * It's a non-leaf entry. Make a hole for the new data
739 * in the key and ptr regions of the block.
740 */
741 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
742 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
743#ifdef DEBUG
744 for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) {
745 if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
746 return error;
747 }
748#endif
749 memmove(&kp[ptr], &kp[ptr - 1],
750 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp)); /* INT_: copy */
751 memmove(&pp[ptr], &pp[ptr - 1],
752 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp)); /* INT_: copy */
753#ifdef DEBUG
754 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
755 return error;
756#endif
757 /*
758 * Now stuff the new data in, bump numrecs and log the new data.
759 */
760 kp[ptr - 1] = key;
761 INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
762 INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
763 xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
764 xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
765#ifdef DEBUG
766 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
767 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
768 kp + ptr);
769#endif
770 } else {
771 /*
772 * It's a leaf entry. Make a hole for the new record.
773 */
774 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
775 memmove(&rp[ptr], &rp[ptr - 1],
776 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp));
777 /*
778 * Now stuff the new record in, bump numrecs
779 * and log the new data.
780 */
781 rp[ptr - 1] = *recp; /* INT_: struct copy */
782 INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
783 xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
784#ifdef DEBUG
785 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
786 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
787 rp + ptr);
788#endif
789 }
790 /*
791 * Log the new number of records in the btree header.
792 */
793 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
794 /*
795 * If we inserted at the start of a block, update the parents' keys.
796 */
797 if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
798 return error;
799 /*
800 * Look to see if the longest extent in the allocation group
801 * needs to be updated.
802 */
803
804 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
805 if (level == 0 &&
806 cur->bc_btnum == XFS_BTNUM_CNT &&
807 INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
808 INT_GET(recp->ar_blockcount, ARCH_CONVERT) > INT_GET(agf->agf_longest, ARCH_CONVERT)) {
809 /*
810 * If this is a leaf in the by-size btree and there
811 * is no right sibling block and this block is bigger
812 * than the previous longest block, update it.
813 */
814 INT_COPY(agf->agf_longest, recp->ar_blockcount, ARCH_CONVERT);
815 cur->bc_mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest
816 = INT_GET(recp->ar_blockcount, ARCH_CONVERT);
817 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
818 XFS_AGF_LONGEST);
819 }
820 /*
821 * Return the new block number, if any.
822 * If there is one, give back a record value and a cursor too.
823 */
824 *bnop = nbno;
825 if (nbno != NULLAGBLOCK) {
826 *recp = nrec; /* INT_: struct copy */
827 *curp = ncur; /* INT_: struct copy */
828 }
829 *stat = 1;
830 return 0;
831}
832
833/*
834 * Log header fields from a btree block.
835 */
836STATIC void
837xfs_alloc_log_block(
838 xfs_trans_t *tp, /* transaction pointer */
839 xfs_buf_t *bp, /* buffer containing btree block */
840 int fields) /* mask of fields: XFS_BB_... */
841{
842 int first; /* first byte offset logged */
843 int last; /* last byte offset logged */
844 static const short offsets[] = { /* table of offsets */
845 offsetof(xfs_alloc_block_t, bb_magic),
846 offsetof(xfs_alloc_block_t, bb_level),
847 offsetof(xfs_alloc_block_t, bb_numrecs),
848 offsetof(xfs_alloc_block_t, bb_leftsib),
849 offsetof(xfs_alloc_block_t, bb_rightsib),
850 sizeof(xfs_alloc_block_t)
851 };
852
853 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
854 xfs_trans_log_buf(tp, bp, first, last);
855}
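
/*
 * Illustrative sketch (not from the original source): the first/last
 * computation that xfs_btree_offsets() is assumed to perform for the
 * call above.  Given a nonzero bitmask of fields and an offsets table
 * terminated by the structure size, it yields the byte range covering
 * every set field, which xfs_trans_log_buf() then logs.  Names with a
 * sketch_ prefix are hypothetical.
 */
static void
sketch_btree_offsets(
	int		fields,		/* nonzero bitmask of fields */
	const short	*offsets,	/* offsets[i] = start of field i */
	int		nbits,		/* number of real fields */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	int		i;		/* field index */
	int		lo = -1;	/* lowest set bit */
	int		hi = -1;	/* highest set bit */

	for (i = 0; i < nbits; i++) {
		if (fields & (1 << i)) {
			if (lo < 0)
				lo = i;
			hi = i;
		}
	}
	*first = offsets[lo];		/* start of the lowest field */
	*last = offsets[hi + 1] - 1;	/* last byte of the highest field */
}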
856
857/*
858 * Log keys from a btree block (nonleaf).
859 */
860STATIC void
861xfs_alloc_log_keys(
862 xfs_btree_cur_t *cur, /* btree cursor */
863 xfs_buf_t *bp, /* buffer containing btree block */
864 int kfirst, /* index of first key to log */
865 int klast) /* index of last key to log */
866{
867 xfs_alloc_block_t *block; /* btree block to log from */
868 int first; /* first byte offset logged */
869 xfs_alloc_key_t *kp; /* key pointer in btree block */
870 int last; /* last byte offset logged */
871
872 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
873 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
874 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
875 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
876 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
877}
878
879/*
880 * Log block pointer fields from a btree block (nonleaf).
881 */
882STATIC void
883xfs_alloc_log_ptrs(
884 xfs_btree_cur_t *cur, /* btree cursor */
885 xfs_buf_t *bp, /* buffer containing btree block */
886 int pfirst, /* index of first pointer to log */
887 int plast) /* index of last pointer to log */
888{
889 xfs_alloc_block_t *block; /* btree block to log from */
890 int first; /* first byte offset logged */
891 int last; /* last byte offset logged */
892 xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
893
894 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
895 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
896 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
897 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
898 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
899}
900
901/*
902 * Log records from a btree block (leaf).
903 */
904STATIC void
905xfs_alloc_log_recs(
906 xfs_btree_cur_t *cur, /* btree cursor */
907 xfs_buf_t *bp, /* buffer containing btree block */
908 int rfirst, /* index of first record to log */
909 int rlast) /* index of last record to log */
910{
911 xfs_alloc_block_t *block; /* btree block to log from */
912 int first; /* first byte offset logged */
913 int last; /* last byte offset logged */
914 xfs_alloc_rec_t *rp; /* record pointer for btree block */
915
916
917 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
918 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
919#ifdef DEBUG
920 {
921 xfs_agf_t *agf;
922 xfs_alloc_rec_t *p;
923
924 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
925 for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
926 ASSERT(INT_GET(p->ar_startblock, ARCH_CONVERT) + INT_GET(p->ar_blockcount, ARCH_CONVERT) <=
927 INT_GET(agf->agf_length, ARCH_CONVERT));
928 }
929#endif
930 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
931 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
932 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
933}
934
935/*
936 * Lookup the record. The cursor is made to point to it, based on dir.
937 * Return 0 if we can't find any such record, 1 for success.
938 */
939STATIC int /* error */
940xfs_alloc_lookup(
941 xfs_btree_cur_t *cur, /* btree cursor */
942 xfs_lookup_t dir, /* <=, ==, or >= */
943 int *stat) /* success/failure */
944{
945 xfs_agblock_t agbno; /* a.g. relative btree block number */
946 xfs_agnumber_t agno; /* allocation group number */
947 xfs_alloc_block_t *block=NULL; /* current btree block */
948 int diff; /* difference for the current key */
949 int error; /* error return value */
950 int keyno=0; /* current key number */
951 int level; /* level in the btree */
952 xfs_mount_t *mp; /* file system mount point */
953
954 XFS_STATS_INC(xs_abt_lookup);
955 /*
956 * Get the allocation group header, and the root block number.
957 */
958 mp = cur->bc_mp;
959
960 {
961 xfs_agf_t *agf; /* a.g. freespace header */
962
963 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
964 agno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
965 agbno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
966 }
967 /*
968 * Iterate over each level in the btree, starting at the root.
969 * For each level above the leaves, find the key we need, based
970 * on the lookup record, then follow the corresponding block
971 * pointer down to the next level.
972 */
973 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
974 xfs_buf_t *bp; /* buffer pointer for btree block */
975 xfs_daddr_t d; /* disk address of btree block */
976
977 /*
978 * Get the disk address we're looking for.
979 */
980 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
981 /*
982 * If the old buffer at this level is for a different block,
983 * throw it away, otherwise just use it.
984 */
985 bp = cur->bc_bufs[level];
986 if (bp && XFS_BUF_ADDR(bp) != d)
987 bp = (xfs_buf_t *)0;
988 if (!bp) {
989 /*
990 * Need to get a new buffer. Read it, then
991 * set it in the cursor, releasing the old one.
992 */
993 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
994 agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
995 return error;
996 xfs_btree_setbuf(cur, level, bp);
997 /*
998 * Point to the btree block, now that we have the buffer.
999 */
1000 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1001 if ((error = xfs_btree_check_sblock(cur, block, level,
1002 bp)))
1003 return error;
1004 } else
1005 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1006 /*
1007 * If we already had a key match at a higher level, we know
1008 * we need to use the first entry in this block.
1009 */
1010 if (diff == 0)
1011 keyno = 1;
1012 /*
1013 * Otherwise we need to search this block. Do a binary search.
1014 */
1015 else {
1016 int high; /* high entry number */
1017 xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
1018 xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
1019 int low; /* low entry number */
1020
1021 /*
1022 * Get a pointer to keys or records.
1023 */
1024 if (level > 0)
1025 kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
1026 else
1027 krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
1028 /*
1029 * Set low and high entry numbers, 1-based.
1030 */
1031 low = 1;
1032 if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
1033 /*
1034 * If the block is empty, the tree must
1035 * be an empty leaf.
1036 */
1037 ASSERT(level == 0 && cur->bc_nlevels == 1);
1038 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1039 *stat = 0;
1040 return 0;
1041 }
1042 /*
1043 * Binary search the block.
1044 */
1045 while (low <= high) {
1046 xfs_extlen_t blockcount; /* key value */
1047 xfs_agblock_t startblock; /* key value */
1048
1049 XFS_STATS_INC(xs_abt_compare);
1050 /*
1051 * keyno is average of low and high.
1052 */
1053 keyno = (low + high) >> 1;
1054 /*
1055 * Get startblock & blockcount.
1056 */
1057 if (level > 0) {
1058 xfs_alloc_key_t *kkp;
1059
1060 kkp = kkbase + keyno - 1;
1061 startblock = INT_GET(kkp->ar_startblock, ARCH_CONVERT);
1062 blockcount = INT_GET(kkp->ar_blockcount, ARCH_CONVERT);
1063 } else {
1064 xfs_alloc_rec_t *krp;
1065
1066 krp = krbase + keyno - 1;
1067 startblock = INT_GET(krp->ar_startblock, ARCH_CONVERT);
1068 blockcount = INT_GET(krp->ar_blockcount, ARCH_CONVERT);
1069 }
1070 /*
1071 * Compute difference to get next direction.
1072 */
1073 if (cur->bc_btnum == XFS_BTNUM_BNO)
1074 diff = (int)startblock -
1075 (int)cur->bc_rec.a.ar_startblock;
1076 else if (!(diff = (int)blockcount -
1077 (int)cur->bc_rec.a.ar_blockcount))
1078 diff = (int)startblock -
1079 (int)cur->bc_rec.a.ar_startblock;
1080 /*
1081 * Less than, move right.
1082 */
1083 if (diff < 0)
1084 low = keyno + 1;
1085 /*
1086 * Greater than, move left.
1087 */
1088 else if (diff > 0)
1089 high = keyno - 1;
1090 /*
1091 * Equal, we're done.
1092 */
1093 else
1094 break;
1095 }
1096 }
1097 /*
1098 * If there are more levels, set up for the next level
1099 * by getting the block number and filling in the cursor.
1100 */
1101 if (level > 0) {
1102 /*
1103 * If we moved left, need the previous key number,
1104 * unless there isn't one.
1105 */
1106 if (diff > 0 && --keyno < 1)
1107 keyno = 1;
1108 agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
1109#ifdef DEBUG
1110 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1111 return error;
1112#endif
1113 cur->bc_ptrs[level] = keyno;
1114 }
1115 }
1116 /*
1117 * Done with the search.
1118 * See if we need to adjust the results.
1119 */
1120 if (dir != XFS_LOOKUP_LE && diff < 0) {
1121 keyno++;
1122 /*
1123 * If this is a ge search and we went off the end of the block,
1124 * but it's not the last block, we're in the wrong block.
1125 */
1126 if (dir == XFS_LOOKUP_GE &&
1127 keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
1128 INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1129 int i;
1130
1131 cur->bc_ptrs[0] = keyno;
1132 if ((error = xfs_alloc_increment(cur, 0, &i)))
1133 return error;
1134 XFS_WANT_CORRUPTED_RETURN(i == 1);
1135 *stat = 1;
1136 return 0;
1137 }
1138 }
1139 else if (dir == XFS_LOOKUP_LE && diff > 0)
1140 keyno--;
1141 cur->bc_ptrs[0] = keyno;
1142 /*
1143 * Return if we succeeded or not.
1144 */
1145 if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
1146 *stat = 0;
1147 else
1148 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1149 return 0;
1150}
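
/*
 * Illustrative sketch (not from the original source): the 1-based
 * binary search loop above, reduced to a flat integer array so the
 * direction logic is easier to see.  diff < 0 means the probed entry
 * sorts before the search value, so the search moves right; the
 * caller then applies the LE/GE/EQ adjustments exactly as
 * xfs_alloc_lookup() does.  sketch_bsearch() is a hypothetical name.
 */
static int				/* 1-based index of final probe */
sketch_bsearch(
	const int	*vals,		/* sorted values, vals[0] is entry 1 */
	int		numrecs,	/* number of entries */
	int		want,		/* value to search for */
	int		*diffp)		/* output: final comparison result */
{
	int		low = 1;	/* low entry number */
	int		high = numrecs;	/* high entry number */
	int		keyno = 0;	/* current probe, 1-based */
	int		diff = 1;	/* comparison result */

	while (low <= high) {
		keyno = (low + high) >> 1;
		diff = vals[keyno - 1] - want;
		if (diff < 0)
			low = keyno + 1;	/* probe too small */
		else if (diff > 0)
			high = keyno - 1;	/* probe too large */
		else
			break;			/* exact match */
	}
	*diffp = diff;
	return keyno;
}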
1151
1152/*
1153 * Move 1 record left from cur/level if possible.
1154 * Update cur to reflect the new path.
1155 */
1156STATIC int /* error */
1157xfs_alloc_lshift(
1158 xfs_btree_cur_t *cur, /* btree cursor */
1159 int level, /* level to shift record on */
1160 int *stat) /* success/failure */
1161{
1162 int error; /* error return value */
1163#ifdef DEBUG
1164 int i; /* loop index */
1165#endif
1166 xfs_alloc_key_t key; /* key value for leaf level upward */
1167 xfs_buf_t *lbp; /* buffer for left neighbor block */
1168 xfs_alloc_block_t *left; /* left neighbor btree block */
1169 int nrec; /* new number of left block entries */
1170 xfs_buf_t *rbp; /* buffer for right (current) block */
1171 xfs_alloc_block_t *right; /* right (current) btree block */
1172 xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
1173 xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
1174 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
1175
1176 /*
1177 * Set up variables for this block as "right".
1178 */
1179 rbp = cur->bc_bufs[level];
1180 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1181#ifdef DEBUG
1182 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1183 return error;
1184#endif
1185 /*
1186 * If we've got no left sibling then we can't shift an entry left.
1187 */
1188 if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1189 *stat = 0;
1190 return 0;
1191 }
1192 /*
1193 * If the cursor entry is the one that would be moved, don't
1194 * do it... it's too complicated.
1195 */
1196 if (cur->bc_ptrs[level] <= 1) {
1197 *stat = 0;
1198 return 0;
1199 }
1200 /*
1201 * Set up the left neighbor as "left".
1202 */
1203 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1204 cur->bc_private.a.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
1205 XFS_ALLOC_BTREE_REF)))
1206 return error;
1207 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1208 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1209 return error;
1210 /*
1211 * If it's full, it can't take another entry.
1212 */
1213 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1214 *stat = 0;
1215 return 0;
1216 }
1217 nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
1218 /*
1219 * If non-leaf, copy a key and a ptr to the left block.
1220 */
1221 if (level > 0) {
1222 xfs_alloc_key_t *lkp; /* key pointer for left block */
1223 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1224
1225 lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
1226 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1227 *lkp = *rkp;
1228 xfs_alloc_log_keys(cur, lbp, nrec, nrec);
1229 lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
1230 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1231#ifdef DEBUG
1232 if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
1233 return error;
1234#endif
1235 *lpp = *rpp; /* INT_: copy */
1236 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
1237 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1238 }
1239 /*
1240 * If leaf, copy a record to the left block.
1241 */
1242 else {
1243 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1244
1245 lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
1246 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1247 *lrp = *rrp;
1248 xfs_alloc_log_recs(cur, lbp, nrec, nrec);
1249 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1250 }
1251 /*
1252 * Bump and log left's numrecs, decrement and log right's numrecs.
1253 */
1254 INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
1255 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1256 INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
1257 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1258 /*
1259 * Slide the contents of right down one entry.
1260 */
1261 if (level > 0) {
1262#ifdef DEBUG
1263 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1264 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
1265 level)))
1266 return error;
1267 }
1268#endif
1269 memmove(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1270 memmove(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1271 xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1272 xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1273 } else {
1274 memmove(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1275 xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1276 key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
1277 key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
1278 rkp = &key;
1279 }
1280 /*
1281 * Update the parent key values of right.
1282 */
1283 if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
1284 return error;
1285 /*
1286 * Slide the cursor value left one.
1287 */
1288 cur->bc_ptrs[level]--;
1289 *stat = 1;
1290 return 0;
1291}
1292
1293/*
1294 * Allocate a new root block, fill it in.
1295 */
1296STATIC int /* error */
1297xfs_alloc_newroot(
1298 xfs_btree_cur_t *cur, /* btree cursor */
1299 int *stat) /* success/failure */
1300{
1301 int error; /* error return value */
1302 xfs_agblock_t lbno; /* left block number */
1303 xfs_buf_t *lbp; /* left btree buffer */
1304 xfs_alloc_block_t *left; /* left btree block */
1305 xfs_mount_t *mp; /* mount structure */
1306 xfs_agblock_t nbno; /* new block number */
1307 xfs_buf_t *nbp; /* new (root) buffer */
1308 xfs_alloc_block_t *new; /* new (root) btree block */
1309 int nptr; /* new value for key index, 1 or 2 */
1310 xfs_agblock_t rbno; /* right block number */
1311 xfs_buf_t *rbp; /* right btree buffer */
1312 xfs_alloc_block_t *right; /* right btree block */
1313
1314 mp = cur->bc_mp;
1315
1316 ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
1317 /*
1318 * Get a buffer from the freelist blocks, for the new root.
1319 */
1320 if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
1321 &nbno)))
1322 return error;
1323 /*
1324 * None available, we fail.
1325 */
1326 if (nbno == NULLAGBLOCK) {
1327 *stat = 0;
1328 return 0;
1329 }
1330 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1331 nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
1332 0);
1333 new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
1334 /*
1335 * Set the root data in the a.g. freespace structure.
1336 */
1337 {
1338 xfs_agf_t *agf; /* a.g. freespace header */
1339 xfs_agnumber_t seqno;
1340
1341 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
1342 INT_SET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT, nbno);
1343 INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, 1);
1344 seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
1345 mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
1346 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
1347 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
1348 }
1349 /*
1350 * At the previous root level there are now two blocks: the old
1351 * root, and the new block generated when it was split.
1352 * We don't know which one the cursor is pointing at, so we
1353 * set up variables "left" and "right" for each case.
1354 */
1355 lbp = cur->bc_bufs[cur->bc_nlevels - 1];
1356 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1357#ifdef DEBUG
1358 if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
1359 return error;
1360#endif
1361 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1362 /*
1363 * Our block is left, pick up the right block.
1364 */
1365 lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
1366 rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
1367 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1368 cur->bc_private.a.agno, rbno, 0, &rbp,
1369 XFS_ALLOC_BTREE_REF)))
1370 return error;
1371 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1372 if ((error = xfs_btree_check_sblock(cur, right,
1373 cur->bc_nlevels - 1, rbp)))
1374 return error;
1375 nptr = 1;
1376 } else {
1377 /*
1378 * Our block is right, pick up the left block.
1379 */
1380 rbp = lbp;
1381 right = left;
1382 rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
1383 lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
1384 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1385 cur->bc_private.a.agno, lbno, 0, &lbp,
1386 XFS_ALLOC_BTREE_REF)))
1387 return error;
1388 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1389 if ((error = xfs_btree_check_sblock(cur, left,
1390 cur->bc_nlevels - 1, lbp)))
1391 return error;
1392 nptr = 2;
1393 }
1394 /*
1395 * Fill in the new block's btree header and log it.
1396 */
1397 INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1398 INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
1399 INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
1400 INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
1401 INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
1402 xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
1403 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1404 /*
1405 * Fill in the key data in the new root.
1406 */
1407 {
1408 xfs_alloc_key_t *kp; /* btree key pointer */
1409
1410 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
1411 if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
1412 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */
1413 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */
1414 } else {
1415 xfs_alloc_rec_t *rp; /* btree record pointer */
1416
1417 rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
1418 kp[0].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
1419 kp[0].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
1420 rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1421 kp[1].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
1422 kp[1].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
1423 }
1424 }
1425 xfs_alloc_log_keys(cur, nbp, 1, 2);
1426 /*
1427 * Fill in the pointer data in the new root.
1428 */
1429 {
1430 xfs_alloc_ptr_t *pp; /* btree address pointer */
1431
1432 pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
1433 INT_SET(pp[0], ARCH_CONVERT, lbno);
1434 INT_SET(pp[1], ARCH_CONVERT, rbno);
1435 }
1436 xfs_alloc_log_ptrs(cur, nbp, 1, 2);
1437 /*
1438 * Fix up the cursor.
1439 */
1440 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1441 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1442 cur->bc_nlevels++;
1443 *stat = 1;
1444 return 0;
1445}
1446
1447/*
1448 * Move 1 record right from cur/level if possible.
1449 * Update cur to reflect the new path.
1450 */
1451STATIC int /* error */
1452xfs_alloc_rshift(
1453 xfs_btree_cur_t *cur, /* btree cursor */
1454 int level, /* level to shift record on */
1455 int *stat) /* success/failure */
1456{
1457 int error; /* error return value */
1458 int i; /* loop index */
1459 xfs_alloc_key_t key; /* key value for leaf level upward */
1460 xfs_buf_t *lbp; /* buffer for left (current) block */
1461 xfs_alloc_block_t *left; /* left (current) btree block */
1462 xfs_buf_t *rbp; /* buffer for right neighbor block */
1463 xfs_alloc_block_t *right; /* right neighbor btree block */
1464 xfs_alloc_key_t *rkp; /* key pointer for right block */
1465 xfs_btree_cur_t *tcur; /* temporary cursor */
1466
1467 /*
1468 * Set up variables for this block as "left".
1469 */
1470 lbp = cur->bc_bufs[level];
1471 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1472#ifdef DEBUG
1473 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1474 return error;
1475#endif
1476 /*
1477 * If we've got no right sibling then we can't shift an entry right.
1478 */
1479 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1480 *stat = 0;
1481 return 0;
1482 }
1483 /*
1484 * If the cursor entry is the one that would be moved, don't
1485 * do it... it's too complicated.
1486 */
1487 if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
1488 *stat = 0;
1489 return 0;
1490 }
1491 /*
1492 * Set up the right neighbor as "right".
1493 */
1494 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1495 cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
1496 XFS_ALLOC_BTREE_REF)))
1497 return error;
1498 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1499 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1500 return error;
1501 /*
1502 * If it's full, it can't take another entry.
1503 */
1504 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1505 *stat = 0;
1506 return 0;
1507 }
1508 /*
1509 * Make a hole at the start of the right neighbor block, then
1510 * copy the last left block entry to the hole.
1511 */
1512 if (level > 0) {
1513 xfs_alloc_key_t *lkp; /* key pointer for left block */
1514 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1515 xfs_alloc_ptr_t *rpp; /* address pointer for right block */
1516
1517 lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1518 lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1519 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1520 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1521#ifdef DEBUG
1522 for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
1523 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
1524 return error;
1525 }
1526#endif
1527 memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1528 memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1529#ifdef DEBUG
1530 if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
1531 return error;
1532#endif
1533 *rkp = *lkp; /* INT_: copy */
1534 *rpp = *lpp; /* INT_: copy */
1535 xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1536 xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1537 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1538 } else {
1539 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1540 xfs_alloc_rec_t *rrp; /* record pointer for right block */
1541
1542 lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1543 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1544 memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1545 *rrp = *lrp;
1546 xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1547 key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
1548 key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
1549 rkp = &key;
1550 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1551 }
1552 /*
1553 * Decrement and log left's numrecs, bump and log right's numrecs.
1554 */
1555 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
1556 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1557 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1558 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1559 /*
1560 * Using a temporary cursor, update the parent key values of the
1561 * block on the right.
1562 */
1563 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1564 return error;
1565 i = xfs_btree_lastrec(tcur, level);
1566 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1567 if ((error = xfs_alloc_increment(tcur, level, &i)) ||
1568 (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
1569 goto error0;
1570 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1571 *stat = 1;
1572 return 0;
1573error0:
1574 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1575 return error;
1576}
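
/*
 * Illustrative sketch (not from the original source): the hole-opening
 * idiom used above.  memmove() tolerates the overlapping ranges, so a
 * single call slides numrecs entries up one slot and frees index 0 for
 * the entry arriving from the left sibling.  Plain ints stand in for
 * the key/ptr/record types; sketch_open_hole() is a hypothetical name.
 */
#include <string.h>

static void
sketch_open_hole(
	int		*entries,	/* array with room for numrecs + 1 */
	int		numrecs,	/* entries currently in use */
	int		newval)		/* value to place in the hole */
{
	memmove(entries + 1, entries, numrecs * sizeof(*entries));
	entries[0] = newval;
}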
1577
1578/*
1579 * Split cur/level block in half.
1580 * Return new block number and its first record (to be inserted into parent).
1581 */
1582STATIC int /* error */
1583xfs_alloc_split(
1584 xfs_btree_cur_t *cur, /* btree cursor */
1585 int level, /* level to split */
1586 xfs_agblock_t *bnop, /* output: block number allocated */
1587 xfs_alloc_key_t *keyp, /* output: first key of new block */
1588 xfs_btree_cur_t **curp, /* output: new cursor */
1589 int *stat) /* success/failure */
1590{
1591 int error; /* error return value */
1592 int i; /* loop index/record number */
1593 xfs_agblock_t lbno; /* left (current) block number */
1594 xfs_buf_t *lbp; /* buffer for left block */
1595 xfs_alloc_block_t *left; /* left (current) btree block */
1596 xfs_agblock_t rbno; /* right (new) block number */
1597 xfs_buf_t *rbp; /* buffer for right block */
1598 xfs_alloc_block_t *right; /* right (new) btree block */
1599
1600 /*
1601 * Allocate the new block from the freelist.
1602 * If we can't do it, we're toast. Give up.
1603 */
1604 if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
1605 &rbno)))
1606 return error;
1607 if (rbno == NULLAGBLOCK) {
1608 *stat = 0;
1609 return 0;
1610 }
1611 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1612 rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
1613 rbno, 0);
1614 /*
1615 * Set up the new block as "right".
1616 */
1617 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1618 /*
1619 * "Left" is the current (according to the cursor) block.
1620 */
1621 lbp = cur->bc_bufs[level];
1622 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1623#ifdef DEBUG
1624 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1625 return error;
1626#endif
1627 /*
1628 * Fill in the btree header for the new block.
1629 */
1630 INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1631 right->bb_level = left->bb_level; /* INT_: direct copy */
1632 INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
1633 /*
1634 * Make sure that if there's an odd number of entries now,
1635 * each new block will have the same number of entries.
1636 */
1637 if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
1638 cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
1639 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1640 i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
1641 /*
1642 * For non-leaf blocks, copy keys and addresses over to the new block.
1643 */
1644 if (level > 0) {
1645 xfs_alloc_key_t *lkp; /* left btree key pointer */
1646 xfs_alloc_ptr_t *lpp; /* left btree address pointer */
1647 xfs_alloc_key_t *rkp; /* right btree key pointer */
1648 xfs_alloc_ptr_t *rpp; /* right btree address pointer */
1649
1650 lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
1651 lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
1652 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1653 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1654#ifdef DEBUG
1655 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1656 if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
1657 return error;
1658 }
1659#endif
1660 memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); /* INT_: copy */
1661 memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); /* INT_: copy */
1662 xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1663 xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1664 *keyp = *rkp;
1665 }
1666 /*
1667 * For leaf blocks, copy records over to the new block.
1668 */
1669 else {
1670 xfs_alloc_rec_t *lrp; /* left btree record pointer */
1671 xfs_alloc_rec_t *rrp; /* right btree record pointer */
1672
1673 lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
1674 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1675 memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1676 xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1677 keyp->ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
1678 keyp->ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
1679 }
1680 /*
1681 * Find the left block number by looking in the buffer.
1682 * Adjust numrecs, sibling pointers.
1683 */
1684 lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
1685 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
1686 right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
1687 INT_SET(left->bb_rightsib, ARCH_CONVERT, rbno);
1688 INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
1689 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
1690 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1691 /*
1692 * If there's a block to the new block's right, make that block
1693 * point back to right instead of to left.
1694 */
1695 if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1696 xfs_alloc_block_t *rrblock; /* rr btree block */
1697 xfs_buf_t *rrbp; /* buffer for rrblock */
1698
1699 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1700 cur->bc_private.a.agno, INT_GET(right->bb_rightsib, ARCH_CONVERT), 0,
1701 &rrbp, XFS_ALLOC_BTREE_REF)))
1702 return error;
1703 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
1704 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1705 return error;
1706 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, rbno);
1707 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
1708 }
1709 /*
1710 * If the cursor is really in the right block, move it there.
1711 * If it's just pointing past the last entry in left, then we'll
1712 * insert there, so don't change anything in that case.
1713 */
1714 if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
1715 xfs_btree_setbuf(cur, level, rbp);
1716 cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
1717 }
1718 /*
1719 * If there are more levels, we'll need another cursor which refers to
1720 * the right block, no matter where this cursor was.
1721 */
1722 if (level + 1 < cur->bc_nlevels) {
1723 if ((error = xfs_btree_dup_cursor(cur, curp)))
1724 return error;
1725 (*curp)->bc_ptrs[level + 1]++;
1726 }
1727 *bnop = rbno;
1728 *stat = 1;
1729 return 0;
1730}
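
/*
 * Illustrative sketch (not from the original source): the split-point
 * arithmetic above.  Half the records move to the new right block;
 * with an odd count, the extra record goes to whichever side the
 * pending insert will NOT land in, so both blocks hold equal counts
 * once the insert completes.  sketch_split_count() is a hypothetical
 * name; cur_ptr is the 1-based insertion index in the left block.
 */
static int				/* records to move right */
sketch_split_count(
	int		left_numrecs,	/* records now in the left block */
	int		cur_ptr)	/* 1-based insertion index */
{
	int		right_numrecs;	/* records for the right block */

	right_numrecs = left_numrecs / 2;
	if ((left_numrecs & 1) && cur_ptr <= right_numrecs + 1)
		right_numrecs++;
	return right_numrecs;
}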
1731
1732/*
1733 * Update keys at all levels from here to the root along the cursor's path.
1734 */
1735STATIC int /* error */
1736xfs_alloc_updkey(
1737 xfs_btree_cur_t *cur, /* btree cursor */
1738 xfs_alloc_key_t *keyp, /* new key value to update to */
1739 int level) /* starting level for update */
1740{
1741 int ptr; /* index of key in block */
1742
1743 /*
1744 * Go up the tree from this level toward the root.
1745 * At each level, update the key value to the value input.
1746 * Stop when we reach a level where the cursor isn't pointing
1747 * at the first entry in the block.
1748 */
1749 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1750 xfs_alloc_block_t *block; /* btree block */
1751 xfs_buf_t *bp; /* buffer for block */
1752#ifdef DEBUG
1753 int error; /* error return value */
1754#endif
1755 xfs_alloc_key_t *kp; /* ptr to btree block keys */
1756
1757 bp = cur->bc_bufs[level];
1758 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1759#ifdef DEBUG
1760 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1761 return error;
1762#endif
1763 ptr = cur->bc_ptrs[level];
1764 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
1765 *kp = *keyp;
1766 xfs_alloc_log_keys(cur, bp, ptr, ptr);
1767 }
1768 return 0;
1769}
1770
1771/*
1772 * Externally visible routines.
1773 */
1774
1775/*
1776 * Decrement cursor by one record at the level.
1777 * For nonzero levels the leaf-ward information is untouched.
1778 */
1779int /* error */
1780xfs_alloc_decrement(
1781 xfs_btree_cur_t *cur, /* btree cursor */
1782 int level, /* level in btree, 0 is leaf */
1783 int *stat) /* success/failure */
1784{
1785 xfs_alloc_block_t *block; /* btree block */
1786 int error; /* error return value */
1787 int lev; /* btree level */
1788
1789 ASSERT(level < cur->bc_nlevels);
1790 /*
1791 * Read-ahead to the left at this level.
1792 */
1793 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1794 /*
1795 * Decrement the ptr at this level. If we're still in the block
1796 * then we're done.
1797 */
1798 if (--cur->bc_ptrs[level] > 0) {
1799 *stat = 1;
1800 return 0;
1801 }
1802 /*
1803 * Get a pointer to the btree block.
1804 */
1805 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
1806#ifdef DEBUG
1807 if ((error = xfs_btree_check_sblock(cur, block, level,
1808 cur->bc_bufs[level])))
1809 return error;
1810#endif
1811 /*
1812 * If we just went off the left edge of the tree, return failure.
1813 */
1814 if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1815 *stat = 0;
1816 return 0;
1817 }
1818 /*
1819 * March up the tree decrementing pointers.
1820 * Stop when we don't go off the left edge of a block.
1821 */
1822 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1823 if (--cur->bc_ptrs[lev] > 0)
1824 break;
1825 /*
1826 * Read-ahead the left block; we're going to read it
1827 * in the next loop.
1828 */
1829 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1830 }
1831 /*
1832 * If we went off the root then we are seriously confused.
1833 */
1834 ASSERT(lev < cur->bc_nlevels);
1835 /*
1836 * Now walk back down the tree, fixing up the cursor's buffer
1837 * pointers and key numbers.
1838 */
1839 for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1840 xfs_agblock_t agbno; /* block number of btree block */
1841 xfs_buf_t *bp; /* buffer pointer for block */
1842
1843 agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1844 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1845 cur->bc_private.a.agno, agbno, 0, &bp,
1846 XFS_ALLOC_BTREE_REF)))
1847 return error;
1848 lev--;
1849 xfs_btree_setbuf(cur, lev, bp);
1850 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1851 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1852 return error;
1853 cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
1854 }
1855 *stat = 1;
1856 return 0;
1857}
1858
1859/*
1860 * Delete the record pointed to by cur.
1861 * The cursor refers to the place where the record was (could be inserted)
1862 * when the operation returns.
1863 */
1864int /* error */
1865xfs_alloc_delete(
1866 xfs_btree_cur_t *cur, /* btree cursor */
1867 int *stat) /* success/failure */
1868{
1869 int error; /* error return value */
1870 int i; /* result code */
1871 int level; /* btree level */
1872
1873 /*
1874 * Go up the tree, starting at leaf level.
1875 * If 2 is returned then a join was done; go to the next level.
1876 * Otherwise we are done.
1877 */
1878 for (level = 0, i = 2; i == 2; level++) {
1879 if ((error = xfs_alloc_delrec(cur, level, &i)))
1880 return error;
1881 }
1882 if (i == 0) {
1883 for (level = 1; level < cur->bc_nlevels; level++) {
1884 if (cur->bc_ptrs[level] == 0) {
1885 if ((error = xfs_alloc_decrement(cur, level, &i)))
1886 return error;
1887 break;
1888 }
1889 }
1890 }
1891 *stat = i;
1892 return 0;
1893}
1894
1895/*
1896 * Get the data from the pointed-to record.
1897 */
1898int /* error */
1899xfs_alloc_get_rec(
1900 xfs_btree_cur_t *cur, /* btree cursor */
1901 xfs_agblock_t *bno, /* output: starting block of extent */
1902 xfs_extlen_t *len, /* output: length of extent */
1903 int *stat) /* output: success/failure */
1904{
1905 xfs_alloc_block_t *block; /* btree block */
1906#ifdef DEBUG
1907 int error; /* error return value */
1908#endif
1909 int ptr; /* record number */
1910
1911 ptr = cur->bc_ptrs[0];
1912 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
1913#ifdef DEBUG
1914 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
1915 return error;
1916#endif
1917 /*
1918 * Off the right end or left end, return failure.
1919 */
1920 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
1921 *stat = 0;
1922 return 0;
1923 }
1924 /*
1925 * Point to the record and extract its data.
1926 */
1927 {
1928 xfs_alloc_rec_t *rec; /* record data */
1929
1930 rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
1931 *bno = INT_GET(rec->ar_startblock, ARCH_CONVERT);
1932 *len = INT_GET(rec->ar_blockcount, ARCH_CONVERT);
1933 }
1934 *stat = 1;
1935 return 0;
1936}
1937
1938/*
1939 * Increment cursor by one record at the level.
1940 * For nonzero levels the leaf-ward information is untouched.
1941 */
1942int /* error */
1943xfs_alloc_increment(
1944 xfs_btree_cur_t *cur, /* btree cursor */
1945 int level, /* level in btree, 0 is leaf */
1946 int *stat) /* success/failure */
1947{
1948 xfs_alloc_block_t *block; /* btree block */
1949 xfs_buf_t *bp; /* tree block buffer */
1950 int error; /* error return value */
1951 int lev; /* btree level */
1952
1953 ASSERT(level < cur->bc_nlevels);
1954 /*
1955 * Read-ahead to the right at this level.
1956 */
1957 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1958 /*
1959 * Get a pointer to the btree block.
1960 */
1961 bp = cur->bc_bufs[level];
1962 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1963#ifdef DEBUG
1964 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1965 return error;
1966#endif
1967 /*
1968 * Increment the ptr at this level. If we're still in the block
1969 * then we're done.
1970 */
1971 if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
1972 *stat = 1;
1973 return 0;
1974 }
1975 /*
1976 * If we just went off the right edge of the tree, return failure.
1977 */
1978 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1979 *stat = 0;
1980 return 0;
1981 }
1982 /*
1983 * March up the tree incrementing pointers.
1984 * Stop when we don't go off the right edge of a block.
1985 */
1986 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1987 bp = cur->bc_bufs[lev];
1988 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1989#ifdef DEBUG
1990 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1991 return error;
1992#endif
1993 if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
1994 break;
1995 /*
1996 * Read-ahead the right block; we're going to read it
1997 * in the next loop.
1998 */
1999 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2000 }
2001 /*
2002 * If we went off the root then we are seriously confused.
2003 */
2004 ASSERT(lev < cur->bc_nlevels);
2005 /*
2006 * Now walk back down the tree, fixing up the cursor's buffer
2007 * pointers and key numbers.
2008 */
2009 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2010 lev > level; ) {
2011 xfs_agblock_t agbno; /* block number of btree block */
2012
2013 agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
2014 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
2015 cur->bc_private.a.agno, agbno, 0, &bp,
2016 XFS_ALLOC_BTREE_REF)))
2017 return error;
2018 lev--;
2019 xfs_btree_setbuf(cur, lev, bp);
2020 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2021 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
2022 return error;
2023 cur->bc_ptrs[lev] = 1;
2024 }
2025 *stat = 1;
2026 return 0;
2027}
2028
2029/*
2030 * Insert the current record at the point referenced by cur.
2031 * The cursor may be inconsistent on return if splits have been done.
2032 */
2033int /* error */
2034xfs_alloc_insert(
2035 xfs_btree_cur_t *cur, /* btree cursor */
2036 int *stat) /* success/failure */
2037{
2038 int error; /* error return value */
2039 int i; /* result value, 0 for failure */
2040 int level; /* current level number in btree */
2041 xfs_agblock_t nbno; /* new block number (split result) */
2042 xfs_btree_cur_t *ncur; /* new cursor (split result) */
2043 xfs_alloc_rec_t nrec; /* record being inserted this level */
2044 xfs_btree_cur_t *pcur; /* previous level's cursor */
2045
2046 level = 0;
2047 nbno = NULLAGBLOCK;
2048 INT_SET(nrec.ar_startblock, ARCH_CONVERT, cur->bc_rec.a.ar_startblock);
2049 INT_SET(nrec.ar_blockcount, ARCH_CONVERT, cur->bc_rec.a.ar_blockcount);
2050 ncur = (xfs_btree_cur_t *)0;
2051 pcur = cur;
2052 /*
2053 * Loop going up the tree, starting at the leaf level.
2054 * Stop when we don't get a split block, that must mean that
2055 * the insert is finished with this level.
2056 */
2057 do {
2058 /*
2059 * Insert nrec/nbno into this level of the tree.
2060 * Note if we fail, nbno will be null.
2061 */
2062 if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
2063 &i))) {
2064 if (pcur != cur)
2065 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2066 return error;
2067 }
2068 /*
2069 * See if the cursor we just used is trash.
2070 * Can't trash the caller's cursor, but otherwise we should
2071 * trash it if ncur is a new cursor or we're about to be done.
2072 */
2073 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
2074 cur->bc_nlevels = pcur->bc_nlevels;
2075 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2076 }
2077 /*
2078 * If we got a new cursor, switch to it.
2079 */
2080 if (ncur) {
2081 pcur = ncur;
2082 ncur = (xfs_btree_cur_t *)0;
2083 }
2084 } while (nbno != NULLAGBLOCK);
2085 *stat = i;
2086 return 0;
2087}
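
/*
 * Illustrative sketch (not from the original source): the usual
 * pairing of a lookup with an insert.  xfs_alloc_lookup_eq() records
 * [bno, len] in cur->bc_rec and positions the cursor;
 * xfs_alloc_insert() then inserts that record, splitting blocks as
 * needed.  Error handling is abbreviated and "cur" is assumed already
 * initialized on a freespace btree; sketch_insert_extent() is a
 * hypothetical name.
 */
static int				/* error */
sketch_insert_extent(
	xfs_btree_cur_t	*cur,		/* initialized btree cursor */
	xfs_agblock_t	bno,		/* starting block of extent */
	xfs_extlen_t	len)		/* length of extent */
{
	int		error;		/* error return value */
	int		i;		/* success/failure */

	if ((error = xfs_alloc_lookup_eq(cur, bno, len, &i)))
		return error;
	ASSERT(i == 0);			/* no duplicate record expected */
	if ((error = xfs_alloc_insert(cur, &i)))
		return error;
	ASSERT(i == 1);			/* insert is expected to succeed */
	return 0;
}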
2088
2089/*
2090 * Lookup the record equal to [bno, len] in the btree given by cur.
2091 */
2092int /* error */
2093xfs_alloc_lookup_eq(
2094 xfs_btree_cur_t *cur, /* btree cursor */
2095 xfs_agblock_t bno, /* starting block of extent */
2096 xfs_extlen_t len, /* length of extent */
2097 int *stat) /* success/failure */
2098{
2099 cur->bc_rec.a.ar_startblock = bno;
2100 cur->bc_rec.a.ar_blockcount = len;
2101 return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
2102}
2103
2104/*
2105 * Lookup the first record greater than or equal to [bno, len]
2106 * in the btree given by cur.
2107 */
2108int /* error */
2109xfs_alloc_lookup_ge(
2110 xfs_btree_cur_t *cur, /* btree cursor */
2111 xfs_agblock_t bno, /* starting block of extent */
2112 xfs_extlen_t len, /* length of extent */
2113 int *stat) /* success/failure */
2114{
2115 cur->bc_rec.a.ar_startblock = bno;
2116 cur->bc_rec.a.ar_blockcount = len;
2117 return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
2118}
2119
2120/*
2121 * Lookup the first record less than or equal to [bno, len]
2122 * in the btree given by cur.
2123 */
2124int /* error */
2125xfs_alloc_lookup_le(
2126 xfs_btree_cur_t *cur, /* btree cursor */
2127 xfs_agblock_t bno, /* starting block of extent */
2128 xfs_extlen_t len, /* length of extent */
2129 int *stat) /* success/failure */
2130{
2131 cur->bc_rec.a.ar_startblock = bno;
2132 cur->bc_rec.a.ar_blockcount = len;
2133 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
2134}
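
/*
 * Illustrative sketch (not from the original source): reading a
 * record back after a lookup.  On the by-size (cnt) btree, a
 * ge-lookup on [0, minlen] positions the cursor at the first free
 * extent at least minlen blocks long, and xfs_alloc_get_rec()
 * extracts it.  Error handling is abbreviated;
 * sketch_find_extent() is a hypothetical name.
 */
static int				/* error */
sketch_find_extent(
	xfs_btree_cur_t	*cur,		/* cursor on the cnt btree */
	xfs_extlen_t	minlen,		/* minimum acceptable length */
	xfs_agblock_t	*bno,		/* output: starting block */
	xfs_extlen_t	*len,		/* output: extent length */
	int		*stat)		/* output: success/failure */
{
	int		error;		/* error return value */

	if ((error = xfs_alloc_lookup_ge(cur, 0, minlen, stat)))
		return error;
	if (!*stat)
		return 0;		/* nothing long enough */
	return xfs_alloc_get_rec(cur, bno, len, stat);
}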
2135
2136/*
2137 * Update the record referred to by cur, to the value given by [bno, len].
2138 * This either works (return 0) or gets an EFSCORRUPTED error.
2139 */
2140int /* error */
2141xfs_alloc_update(
2142 xfs_btree_cur_t *cur, /* btree cursor */
2143 xfs_agblock_t bno, /* starting block of extent */
2144 xfs_extlen_t len) /* length of extent */
2145{
2146 xfs_alloc_block_t *block; /* btree block to update */
2147 int error; /* error return value */
2148 int ptr; /* current record number (updating) */
2149
2150 ASSERT(len > 0);
2151 /*
2152 * Pick up the current btree block.
2153 */
2154 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
2155#ifdef DEBUG
2156 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
2157 return error;
2158#endif
2159 /*
2160 * Get the address of the rec to be updated.
2161 */
2162 ptr = cur->bc_ptrs[0];
2163 {
2164 xfs_alloc_rec_t *rp; /* pointer to updated record */
2165
2166 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
2167 /*
2168 * Fill in the new contents and log them.
2169 */
2170 INT_SET(rp->ar_startblock, ARCH_CONVERT, bno);
2171 INT_SET(rp->ar_blockcount, ARCH_CONVERT, len);
2172 xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
2173 }
2174 /*
2175 * If it's the by-size btree and it's the last leaf block and
2176 * it's the last record... then update the size of the longest
2177 * extent in the a.g., which we cache in the a.g. freelist header.
2178 */
2179 if (cur->bc_btnum == XFS_BTNUM_CNT &&
2180 INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
2181 ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
2182 xfs_agf_t *agf; /* a.g. freespace header */
2183 xfs_agnumber_t seqno;
2184
2185 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
2186 seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
2187 cur->bc_mp->m_perag[seqno].pagf_longest = len;
2188 INT_SET(agf->agf_longest, ARCH_CONVERT, len);
2189 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
2190 XFS_AGF_LONGEST);
2191 }
2192 /*
2193 * Updating first record in leaf. Pass new key value up to our parent.
2194 */
2195 if (ptr == 1) {
2196 xfs_alloc_key_t key; /* key containing [bno, len] */
2197
2198 INT_SET(key.ar_startblock, ARCH_CONVERT, bno);
2199 INT_SET(key.ar_blockcount, ARCH_CONVERT, len);
2200 if ((error = xfs_alloc_updkey(cur, &key, 1)))
2201 return error;
2202 }
2203 return 0;
2204}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
new file mode 100644
index 000000000000..ed5161a572ef
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -0,0 +1,257 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ALLOC_BTREE_H__
33#define __XFS_ALLOC_BTREE_H__
34
35/*
36 * Freespace on-disk structures
37 */
38
39struct xfs_buf;
40struct xfs_btree_cur;
41struct xfs_btree_sblock;
42struct xfs_mount;
43
44/*
45 * There are two on-disk btrees, one sorted by blockno and one sorted
46 * by blockcount and blockno. All blocks look the same to make the code
47 * simpler; if we have time later, we'll make the optimizations.
48 */
49#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
50#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
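
/*
 * Illustrative note (not from the original source): the magic values
 * are just the ASCII bytes of the tree name, e.g.
 * ('A' << 24 | 'B' << 16 | 'T' << 8 | 'B') == 0x41425442.
 */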
51
52/*
53 * Data record/key structure
54 */
55typedef struct xfs_alloc_rec
56{
57 xfs_agblock_t ar_startblock; /* starting block number */
58 xfs_extlen_t ar_blockcount; /* count of free blocks */
59} xfs_alloc_rec_t, xfs_alloc_key_t;
60
61typedef xfs_agblock_t xfs_alloc_ptr_t; /* btree pointer type */
62 /* btree block header type */
63typedef struct xfs_btree_sblock xfs_alloc_block_t;
64
65#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
66xfs_alloc_block_t *xfs_buf_to_alloc_block(struct xfs_buf *bp);
67#define XFS_BUF_TO_ALLOC_BLOCK(bp) xfs_buf_to_alloc_block(bp)
68#else
69#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)(XFS_BUF_PTR(bp)))
70#endif
71
72/*
73 * Real block structures have a size equal to the disk block size.
74 */
75
76#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_SIZE)
77int xfs_alloc_block_size(int lev, struct xfs_btree_cur *cur);
78#define XFS_ALLOC_BLOCK_SIZE(lev,cur) xfs_alloc_block_size(lev,cur)
79#else
80#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
81#endif
82
83#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
84int xfs_alloc_block_maxrecs(int lev, struct xfs_btree_cur *cur);
85#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) xfs_alloc_block_maxrecs(lev,cur)
86#else
87#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) \
88 ((cur)->bc_mp->m_alloc_mxr[lev != 0])
89#endif
90#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
91int xfs_alloc_block_minrecs(int lev, struct xfs_btree_cur *cur);
92#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) xfs_alloc_block_minrecs(lev,cur)
93#else
94#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) \
95 ((cur)->bc_mp->m_alloc_mnr[lev != 0])
96#endif
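
/*
 * Illustrative sketch (not from the original source): how per-level
 * entry limits such as m_alloc_mxr[] are conventionally derived.  A
 * leaf spends its post-header space on whole records, a node on
 * key/pointer pairs.  The real values are computed at mount time; the
 * hdrsize parameter stands in for the btree block header size, so
 * treat this as an assumption for illustration only.
 */
static int
sketch_maxrecs(int blocksize, int hdrsize, int leaf)
{
	int	space = blocksize - hdrsize;

	if (leaf)
		return space / (int)sizeof(xfs_alloc_rec_t);
	return space / (int)(sizeof(xfs_alloc_key_t) +
			     sizeof(xfs_alloc_ptr_t));
}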
97
98/*
99 * Minimum and maximum blocksize and sectorsize.
100 * The blocksize upper limit is pretty much arbitrary.
101 * The sectorsize upper limit is due to sizeof(sb_sectsize).
102 */
103#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
104#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
105#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
106#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
107#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
108#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
109#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
110#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
111
112/*
113 * Block numbers in the AG:
114 * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
115 */
116#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BNO_BLOCK)
117xfs_agblock_t xfs_bno_block(struct xfs_mount *mp);
118#define XFS_BNO_BLOCK(mp) xfs_bno_block(mp)
119#else
120#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
121#endif
122#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CNT_BLOCK)
123xfs_agblock_t xfs_cnt_block(struct xfs_mount *mp);
124#define XFS_CNT_BLOCK(mp) xfs_cnt_block(mp)
125#else
126#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
127#endif
128
129/*
130 * Record, key, and pointer address macros for btree blocks.
131 */
132#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_REC_ADDR)
133xfs_alloc_rec_t *xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i,
134 struct xfs_btree_cur *cur);
135#define XFS_ALLOC_REC_ADDR(bb,i,cur) xfs_alloc_rec_addr(bb,i,cur)
136#else
137#define XFS_ALLOC_REC_ADDR(bb,i,cur) \
138 XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, bb, i, \
139 XFS_ALLOC_BLOCK_MAXRECS(0, cur))
140#endif
141
142#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_KEY_ADDR)
143xfs_alloc_key_t *xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i,
144 struct xfs_btree_cur *cur);
145#define XFS_ALLOC_KEY_ADDR(bb,i,cur) xfs_alloc_key_addr(bb,i,cur)
146#else
147#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \
148 XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
149 XFS_ALLOC_BLOCK_MAXRECS(1, cur))
150#endif
151
152#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_PTR_ADDR)
153xfs_alloc_ptr_t *xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i,
154 struct xfs_btree_cur *cur);
155#define XFS_ALLOC_PTR_ADDR(bb,i,cur) xfs_alloc_ptr_addr(bb,i,cur)
156#else
157#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \
158 XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
159 XFS_ALLOC_BLOCK_MAXRECS(1, cur))
160#endif
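
/*
 * Illustrative sketch (not from the original source): the address
 * arithmetic the XFS_BTREE_*_ADDR macros are assumed to expand to,
 * shown for leaf records.  Indices are 1-based to match the macros
 * above; hdrsize stands in for the xfs_btree_sblock header size,
 * which is defined elsewhere.  sketch_rec_addr() is hypothetical.
 */
static xfs_alloc_rec_t *
sketch_rec_addr(xfs_alloc_block_t *bb, int i, int hdrsize)
{
	return (xfs_alloc_rec_t *)((char *)bb + hdrsize) + (i - 1);
}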
161
162/*
163 * Prototypes for externally visible routines.
164 */
165
166/*
167 * Decrement cursor by one record at the level.
168 * For nonzero levels the leaf-ward information is untouched.
169 */
170int /* error */
171xfs_alloc_decrement(
172 struct xfs_btree_cur *cur, /* btree cursor */
173 int level, /* level in btree, 0 is leaf */
174 int *stat); /* success/failure */
175
176/*
177 * Delete the record pointed to by cur.
178 * The cursor refers to the place where the record was (could be inserted)
179 * when the operation returns.
180 */
181int /* error */
182xfs_alloc_delete(
183 struct xfs_btree_cur *cur, /* btree cursor */
184 int *stat); /* success/failure */
185
186/*
187 * Get the data from the pointed-to record.
188 */
189int /* error */
190xfs_alloc_get_rec(
191 struct xfs_btree_cur *cur, /* btree cursor */
192 xfs_agblock_t *bno, /* output: starting block of extent */
193 xfs_extlen_t *len, /* output: length of extent */
194 int *stat); /* output: success/failure */
195
196/*
197 * Increment cursor by one record at the level.
198 * For nonzero levels the leaf-ward information is untouched.
199 */
200int /* error */
201xfs_alloc_increment(
202 struct xfs_btree_cur *cur, /* btree cursor */
203 int level, /* level in btree, 0 is leaf */
204 int *stat); /* success/failure */
205
206/*
207 * Insert the current record at the point referenced by cur.
208 * The cursor may be inconsistent on return if splits have been done.
209 */
210int /* error */
211xfs_alloc_insert(
212 struct xfs_btree_cur *cur, /* btree cursor */
213 int *stat); /* success/failure */
214
215/*
216 * Lookup the record equal to [bno, len] in the btree given by cur.
217 */
218int /* error */
219xfs_alloc_lookup_eq(
220 struct xfs_btree_cur *cur, /* btree cursor */
221 xfs_agblock_t bno, /* starting block of extent */
222 xfs_extlen_t len, /* length of extent */
223 int *stat); /* success/failure */
224
225/*
226 * Lookup the first record greater than or equal to [bno, len]
227 * in the btree given by cur.
228 */
229int /* error */
230xfs_alloc_lookup_ge(
231 struct xfs_btree_cur *cur, /* btree cursor */
232 xfs_agblock_t bno, /* starting block of extent */
233 xfs_extlen_t len, /* length of extent */
234 int *stat); /* success/failure */
235
236/*
237 * Lookup the first record less than or equal to [bno, len]
238 * in the btree given by cur.
239 */
240int /* error */
241xfs_alloc_lookup_le(
242 struct xfs_btree_cur *cur, /* btree cursor */
243 xfs_agblock_t bno, /* starting block of extent */
244 xfs_extlen_t len, /* length of extent */
245 int *stat); /* success/failure */
246
247/*
248 * Update the record referred to by cur, to the value given by [bno, len].
249 * This either works (return 0) or gets an EFSCORRUPTED error.
250 */
251int /* error */
252xfs_alloc_update(
253 struct xfs_btree_cur *cur, /* btree cursor */
254 xfs_agblock_t bno, /* starting block of extent */
255 xfs_extlen_t len); /* length of extent */
256
257#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
new file mode 100644
index 000000000000..ae35189b3d70
--- /dev/null
+++ b/fs/xfs/xfs_arch.h
@@ -0,0 +1,213 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ARCH_H__
33#define __XFS_ARCH_H__
34
35#ifndef XFS_BIG_INUMS
36# error XFS_BIG_INUMS must be defined true or false
37#endif
38
39#ifdef __KERNEL__
40
41#include <asm/byteorder.h>
42
43#ifdef __LITTLE_ENDIAN
44# define __BYTE_ORDER __LITTLE_ENDIAN
45#endif
46#ifdef __BIG_ENDIAN
47# define __BYTE_ORDER __BIG_ENDIAN
48#endif
49
50#endif /* __KERNEL__ */
51
52/* do we need conversion? */
53
54#define ARCH_NOCONVERT 1
55#if __BYTE_ORDER == __LITTLE_ENDIAN
56# define ARCH_CONVERT 0
57#else
58# define ARCH_CONVERT ARCH_NOCONVERT
59#endif
60
61/* generic swapping macros */
62
63#ifndef HAVE_SWABMACROS
64#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
65#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
66#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
67#endif
68
69#define INT_SWAP(type, var) \
70 ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
71 ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
72 ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
73 (var))))
74
75/*
76 * get and set integers from potentially unaligned locations
77 */
78
79#define INT_GET_UNALIGNED_16_BE(pointer) \
80 ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1])))
81#define INT_SET_UNALIGNED_16_BE(pointer,value) \
82 { \
83 ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \
84 ((__u8*)(pointer))[1] = (((value) ) & 0xff); \
85 }
86
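/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. Round-tripping a 16-bit big-endian value through a
 * potentially unaligned buffer with the macros above (hypothetical
 * function name):
 */
static inline int example_unaligned_be16(void)
{
	__u8 buf[3];
	__u16 val = 0x1234;

	INT_SET_UNALIGNED_16_BE(&buf[1], val);	/* buf[1]=0x12, buf[2]=0x34 */
	return INT_GET_UNALIGNED_16_BE(&buf[1]) == val;	/* always true */
}
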
87/* define generic INT_ macros */
88
89#define INT_GET(reference,arch) \
90 (((arch) == ARCH_NOCONVERT) \
91 ? \
92 (reference) \
93 : \
94 INT_SWAP((reference),(reference)) \
95 )
96
97/* does not return a value */
98#define INT_SET(reference,arch,valueref) \
99 (__builtin_constant_p(valueref) ? \
100 (void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
101 (void)( \
102 ((reference) = (valueref)), \
103 ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
104 ) \
105 )
106
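/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. XFS metadata is big-endian on disk, so on a little-endian
 * host ARCH_CONVERT selects byte swapping and INT_GET/INT_SET hide
 * the difference (hypothetical function name):
 */
static inline void example_int_get_set(__u32 *ondisk, __u32 cpu_val)
{
	__u32 native;

	INT_SET(*ondisk, ARCH_CONVERT, cpu_val);   /* CPU -> on-disk order */
	native = INT_GET(*ondisk, ARCH_CONVERT);   /* on-disk -> CPU order */
	ASSERT(native == cpu_val);
}
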
107/* does not return a value */
108#define INT_MOD_EXPR(reference,arch,code) \
109 (((arch) == ARCH_NOCONVERT) \
110 ? \
111 (void)((reference) code) \
112 : \
113 (void)( \
114 (reference) = INT_GET((reference),arch) , \
115 ((reference) code), \
116 INT_SET(reference, arch, reference) \
117 ) \
118 )
119
120/* does not return a value */
121#define INT_MOD(reference,arch,delta) \
122 (void)( \
123 INT_MOD_EXPR(reference,arch,+=(delta)) \
124 )
125
126/*
127 * INT_COPY - copy a value between two locations with the
128 * _same architecture_ but _potentially different sizes_
129 *
130 * if the types of the two parameters are equal or they are
131 * in native architecture, a simple copy is done
132 *
133 * otherwise, architecture conversions are done
134 *
135 */
136
137/* does not return a value */
138#define INT_COPY(dst,src,arch) \
139 ( \
140 ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
141 ? \
142 (void)((dst) = (src)) \
143 : \
144 INT_SET(dst, arch, INT_GET(src, arch)) \
145 )
146
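/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. INT_COPY widening a 16-bit on-disk field into a 32-bit
 * on-disk field: the sizes differ, so the value is converted to
 * native order and back rather than copied bitwise (hypothetical
 * function name):
 */
static inline void example_int_copy(__u32 *ondisk_dst, __u16 ondisk_src)
{
	INT_COPY(*ondisk_dst, ondisk_src, ARCH_CONVERT);
}
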
147/*
148 * INT_XLATE - copy a value in either direction between two locations
149 * with different architectures
150 *
151 * dir < 0 - copy from memory to buffer (native to arch)
152 * dir > 0 - copy from buffer to memory (arch to native)
153 */
154
155/* does not return a value */
156#define INT_XLATE(buf,mem,dir,arch) {\
157 ASSERT(dir); \
158 if (dir>0) { \
159 (mem)=INT_GET(buf, arch); \
160 } else { \
161 INT_SET(buf, arch, mem); \
162 } \
163}
164
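/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. INT_XLATE moves a value between a disk buffer and an
 * in-memory variable; dir > 0 reads from the buffer, dir < 0 writes
 * to it (hypothetical function name):
 */
static inline void example_int_xlate(__u32 *diskbuf, __u32 *memval, int dir)
{
	INT_XLATE(*diskbuf, *memval, dir, ARCH_CONVERT);
}
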
165/*
166 * In directories, inode numbers are stored on disk as unaligned arrays
167 * of unsigned 8-bit integers.
168 *
169 * For v1 directories, or v2 directories that contain inode numbers that
170 * do not fit into 32 bits, the array has eight members, but the first member
171 * is always zero:
172 *
173 * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7|
174 *
175 * For v2 directories that only contain entries with inode numbers that fit
176 * into 32 bits, a four-member array is used:
177 *
178 * |24-31|16-23| 8-15| 0- 7|
179 */
180
181#define XFS_GET_DIR_INO4(di) \
182 (((u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
183
184#define XFS_PUT_DIR_INO4(from, di) \
185do { \
186 (di).i[0] = (((from) & 0xff000000ULL) >> 24); \
187 (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \
188 (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \
189 (di).i[3] = ((from) & 0x000000ffULL); \
190} while (0)
191
192#define XFS_DI_HI(di) \
193 (((u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
194#define XFS_DI_LO(di) \
195 (((u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7]))
196
197#define XFS_GET_DIR_INO8(di) \
198 (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \
199 ((xfs_ino_t)XFS_DI_HI(di) << 32))
200
201#define XFS_PUT_DIR_INO8(from, di) \
202do { \
203 (di).i[0] = 0; \
204 (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \
205 (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \
206 (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \
207 (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \
208 (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \
209 (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \
210 (di).i[7] = ((from) & 0x00000000000000ffULL); \
211} while (0)
212
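/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. Round-tripping an inode number that needs more than 32 bits
 * through the 8-byte on-disk array; the local struct mirrors the
 * on-disk ino8 layout and the function name is hypothetical:
 */
static inline int example_dir_ino8_roundtrip(void)
{
	struct { __u8 i[8]; } di;
	xfs_ino_t ino = 0x0000123456789abcULL;

	XFS_PUT_DIR_INO8(ino, di);
	/* di.i[] now holds 00 00 12 34 56 78 9a bc */
	return XFS_GET_DIR_INO8(di) == ino;	/* always true */
}
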
213#endif /* __XFS_ARCH_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
new file mode 100644
index 000000000000..ee8b5904ec7c
--- /dev/null
+++ b/fs/xfs/xfs_attr.c
@@ -0,0 +1,2660 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_alloc.h"
50#include "xfs_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_da_btree.h"
59#include "xfs_attr.h"
60#include "xfs_attr_leaf.h"
61#include "xfs_error.h"
62#include "xfs_bit.h"
63#include "xfs_quota.h"
64#include "xfs_rw.h"
65#include "xfs_trans_space.h"
66#include "xfs_acl.h"
67
68/*
69 * xfs_attr.c
70 *
71 * Provide the external interfaces to manage attribute lists.
72 */
73
74/*========================================================================
75 * Function prototypes for the kernel.
76 *========================================================================*/
77
78/*
79 * Internal routines when attribute list fits inside the inode.
80 */
81STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
82
83/*
84 * Internal routines when attribute list is one block.
85 */
86STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
87STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
88STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
89
90/*
91 * Internal routines when attribute list is more than one block.
92 */
93STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
94STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
95STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
96STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
97STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
98
99/*
100 * Routines to manipulate out-of-line attribute values.
101 */
102STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args);
103STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
104STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
105
106#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
107
108#if defined(XFS_ATTR_TRACE)
109ktrace_t *xfs_attr_trace_buf;
110#endif
111
112
113/*========================================================================
114 * Overall external interface routines.
115 *========================================================================*/
116
117int
118xfs_attr_fetch(xfs_inode_t *ip, char *name, int namelen,
119 char *value, int *valuelenp, int flags, struct cred *cred)
120{
121 xfs_da_args_t args;
122 int error;
123
124 if ((XFS_IFORK_Q(ip) == 0) ||
125 (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
126 ip->i_d.di_anextents == 0))
127 return(ENOATTR);
128
129 if (!(flags & (ATTR_KERNACCESS|ATTR_SECURE))) {
130 if ((error = xfs_iaccess(ip, S_IRUSR, cred)))
131 return(XFS_ERROR(error));
132 }
133
134 /*
135 * Fill in the arg structure for this request.
136 */
137 memset((char *)&args, 0, sizeof(args));
138 args.name = name;
139 args.namelen = namelen;
140 args.value = value;
141 args.valuelen = *valuelenp;
142 args.flags = flags;
143 args.hashval = xfs_da_hashname(args.name, args.namelen);
144 args.dp = ip;
145 args.whichfork = XFS_ATTR_FORK;
146
147 /*
148 * Decide on what work routines to call based on the inode size.
149 */
150 if (XFS_IFORK_Q(ip) == 0 ||
151 (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
152 ip->i_d.di_anextents == 0)) {
153 error = XFS_ERROR(ENOATTR);
154 } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
155 error = xfs_attr_shortform_getvalue(&args);
156 } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
157 error = xfs_attr_leaf_get(&args);
158 } else {
159 error = xfs_attr_node_get(&args);
160 }
161
162 /*
163 * Return the number of bytes in the value to the caller.
164 */
165 *valuelenp = args.valuelen;
166
167 if (error == EEXIST)
168 error = 0;
169 return(error);
170}
171
172int
173xfs_attr_get(bhv_desc_t *bdp, char *name, char *value, int *valuelenp,
174 int flags, struct cred *cred)
175{
176 xfs_inode_t *ip = XFS_BHVTOI(bdp);
177 int error, namelen;
178
179 XFS_STATS_INC(xs_attr_get);
180
181 if (!name)
182 return(EINVAL);
183 namelen = strlen(name);
184 if (namelen >= MAXNAMELEN)
185 return(EFAULT); /* match IRIX behaviour */
186
187 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
188 return(EIO);
189
190 xfs_ilock(ip, XFS_ILOCK_SHARED);
191 error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred);
192 xfs_iunlock(ip, XFS_ILOCK_SHARED);
193 return(error);
194}
195
196/*ARGSUSED*/
197int /* error */
198xfs_attr_set(bhv_desc_t *bdp, char *name, char *value, int valuelen, int flags,
199 struct cred *cred)
200{
201 xfs_da_args_t args;
202 xfs_inode_t *dp;
203 xfs_fsblock_t firstblock;
204 xfs_bmap_free_t flist;
205 int error, err2, committed;
206 int local, size;
207 uint nblks;
208 xfs_mount_t *mp;
209 int rsvd = (flags & ATTR_ROOT) != 0;
210 int namelen;
211
212 namelen = strlen(name);
213 if (namelen >= MAXNAMELEN)
214 return EFAULT; /* match IRIX behaviour */
215
216 XFS_STATS_INC(xs_attr_set);
217
218 dp = XFS_BHVTOI(bdp);
219 mp = dp->i_mount;
220 if (XFS_FORCED_SHUTDOWN(mp))
221 return (EIO);
222
223 xfs_ilock(dp, XFS_ILOCK_SHARED);
224 if (!(flags & ATTR_SECURE) &&
225 (error = xfs_iaccess(dp, S_IWUSR, cred))) {
226 xfs_iunlock(dp, XFS_ILOCK_SHARED);
227 return(XFS_ERROR(error));
228 }
229 xfs_iunlock(dp, XFS_ILOCK_SHARED);
230
231 /*
232 * Attach the dquots to the inode.
233 */
234 if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
235 return (error);
236
237 /*
238 * If the inode doesn't have an attribute fork, add one.
239 * (inode must not be locked when we call this routine)
240 */
241 if (XFS_IFORK_Q(dp) == 0) {
242 error = xfs_bmap_add_attrfork(dp, rsvd);
243 if (error)
244 return(error);
245 }
246
247 /*
248 * Fill in the arg structure for this request.
249 */
250 memset((char *)&args, 0, sizeof(args));
251 args.name = name;
252 args.namelen = namelen;
253 args.value = value;
254 args.valuelen = valuelen;
255 args.flags = flags;
256 args.hashval = xfs_da_hashname(args.name, args.namelen);
257 args.dp = dp;
258 args.firstblock = &firstblock;
259 args.flist = &flist;
260 args.whichfork = XFS_ATTR_FORK;
261 args.oknoent = 1;
262
263 /* Determine the space the new attribute will use, and whether it will
264 * be inline or out of line.
265 */
266 size = xfs_attr_leaf_newentsize(&args, mp->m_sb.sb_blocksize, &local);
267
268 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
269 if (local) {
270 if (size > (mp->m_sb.sb_blocksize >> 1)) {
271 /* Double split possible */
272 nblks <<= 1;
273 }
274 } else {
275 uint dblocks = XFS_B_TO_FSB(mp, valuelen);
276 /* Out of line attribute, cannot double split, but make
277 * room for the attribute value itself.
278 */
279 nblks += dblocks;
280 nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
281 }
282
283 /* nblks is now the total block count for the attribute data */
284 args.total = nblks;
285
286 /*
287 * Start our first transaction of the day.
288 *
289 * All future transactions during this code must be "chained" off
290 * this one via the trans_dup() call. All transactions will contain
291 * the inode, and the inode will always be marked with trans_ihold().
292 * Since the inode will be locked in all transactions, we must log
293 * the inode in every transaction to let it float upward through
294 * the log.
295 */
296 args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
297
298 /*
299 * Root fork attributes can use reserved data blocks for this
300 * operation if necessary
301 */
302
303 if (rsvd)
304 args.trans->t_flags |= XFS_TRANS_RESERVE;
305
306 if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
307 XFS_ATTRSET_LOG_RES(mp, nblks),
308 0, XFS_TRANS_PERM_LOG_RES,
309 XFS_ATTRSET_LOG_COUNT))) {
310 xfs_trans_cancel(args.trans, 0);
311 return(error);
312 }
313 xfs_ilock(dp, XFS_ILOCK_EXCL);
314
315 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0,
316 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
317 XFS_QMOPT_RES_REGBLKS);
318 if (error) {
319 xfs_iunlock(dp, XFS_ILOCK_EXCL);
320 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
321 return (error);
322 }
323
324 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
325 xfs_trans_ihold(args.trans, dp);
326
327 /*
328 * If the attribute list is non-existent or a shortform list,
329 * upgrade it to a single-leaf-block attribute list.
330 */
331 if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
332 ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
333 (dp->i_d.di_anextents == 0))) {
334
335 /*
336 * Build initial attribute list (if required).
337 */
338 if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
339 (void)xfs_attr_shortform_create(&args);
340
341 /*
342 * Try to add the attr to the attribute list in
343 * the inode.
344 */
345 error = xfs_attr_shortform_addname(&args);
346 if (error != ENOSPC) {
347 /*
348 * Commit the shortform mods, and we're done.
349 * NOTE: this is also the error path (EEXIST, etc).
350 */
351 ASSERT(args.trans != NULL);
352
353 /*
354 * If this is a synchronous mount, make sure that
355 * the transaction goes to disk before returning
356 * to the user.
357 */
358 if (mp->m_flags & XFS_MOUNT_WSYNC) {
359 xfs_trans_set_sync(args.trans);
360 }
361 err2 = xfs_trans_commit(args.trans,
362 XFS_TRANS_RELEASE_LOG_RES,
363 NULL);
364 xfs_iunlock(dp, XFS_ILOCK_EXCL);
365
366 /*
367 * Hit the inode change time.
368 */
369 if (!error && (flags & ATTR_KERNOTIME) == 0) {
370 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
371 }
372 return(error == 0 ? err2 : error);
373 }
374
375 /*
376 * It won't fit in the shortform, transform to a leaf block.
377 * GROT: another possible req'mt for a double-split btree op.
378 */
379 XFS_BMAP_INIT(args.flist, args.firstblock);
380 error = xfs_attr_shortform_to_leaf(&args);
381 if (!error) {
382 error = xfs_bmap_finish(&args.trans, args.flist,
383 *args.firstblock, &committed);
384 }
385 if (error) {
386 ASSERT(committed);
387 args.trans = NULL;
388 xfs_bmap_cancel(&flist);
389 goto out;
390 }
391
392 /*
393 * bmap_finish() may have committed the last trans and started
394 * a new one. We need the inode to be in all transactions.
395 */
396 if (committed) {
397 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
398 xfs_trans_ihold(args.trans, dp);
399 }
400
401 /*
402 * Commit the leaf transformation. We'll need another (linked)
403 * transaction to add the new attribute to the leaf.
404 */
405 if ((error = xfs_attr_rolltrans(&args.trans, dp)))
406 goto out;
407
408 }
409
410 if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
411 error = xfs_attr_leaf_addname(&args);
412 } else {
413 error = xfs_attr_node_addname(&args);
414 }
415 if (error) {
416 goto out;
417 }
418
419 /*
420 * If this is a synchronous mount, make sure that the
421 * transaction goes to disk before returning to the user.
422 */
423 if (mp->m_flags & XFS_MOUNT_WSYNC) {
424 xfs_trans_set_sync(args.trans);
425 }
426
427 /*
428 * Commit the last in the sequence of transactions.
429 */
430 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
431 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
432 NULL);
433 xfs_iunlock(dp, XFS_ILOCK_EXCL);
434
435 /*
436 * Hit the inode change time.
437 */
438 if (!error && (flags & ATTR_KERNOTIME) == 0) {
439 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
440 }
441
442 return(error);
443
444out:
445 if (args.trans)
446 xfs_trans_cancel(args.trans,
447 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
448 xfs_iunlock(dp, XFS_ILOCK_EXCL);
449 return(error);
450}
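
/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. The "chained transaction" pattern described at the top of
 * xfs_attr_set() is what xfs_attr_rolltrans() provides: duplicate the
 * permanent transaction, commit the old half, and re-reserve log space
 * in the new one. A simplified version (the function name is
 * hypothetical and the ATTRSET reservation constants are reused here
 * purely for illustration):
 */
STATIC int
example_roll_trans(xfs_trans_t **transp, xfs_inode_t *dp, xfs_mount_t *mp)
{
	xfs_trans_t	*trans = *transp;
	int		error;

	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);	/* keep inode in log */
	*transp = xfs_trans_dup(trans);			/* chain a new trans */
	error = xfs_trans_commit(trans, 0, NULL);	/* commit the old one */
	if (error)
		return error;
	error = xfs_trans_reserve(*transp, 0, XFS_ATTRSET_LOG_RES(mp, 0), 0,
				  XFS_TRANS_PERM_LOG_RES,
				  XFS_ATTRSET_LOG_COUNT);
	if (error)
		return error;
	xfs_trans_ijoin(*transp, dp, XFS_ILOCK_EXCL);	/* rejoin the inode */
	xfs_trans_ihold(*transp, dp);
	return 0;
}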
451
452/*
453 * Generic handler routine to remove a name from an attribute list.
454 * Transitions attribute list from Btree to shortform as necessary.
455 */
456/*ARGSUSED*/
457int /* error */
458xfs_attr_remove(bhv_desc_t *bdp, char *name, int flags, struct cred *cred)
459{
460 xfs_da_args_t args;
461 xfs_inode_t *dp;
462 xfs_fsblock_t firstblock;
463 xfs_bmap_free_t flist;
464 int error;
465 xfs_mount_t *mp;
466 int namelen;
467
468 ASSERT(MAXNAMELEN - 1 <= 0xff); /* length is stored in a uint8 */
469 namelen = strlen(name);
470 if (namelen >= MAXNAMELEN)
471 return EFAULT; /* match IRIX behaviour */
472
473 XFS_STATS_INC(xs_attr_remove);
474
475 dp = XFS_BHVTOI(bdp);
476 mp = dp->i_mount;
477 if (XFS_FORCED_SHUTDOWN(mp))
478 return (EIO);
479
480 xfs_ilock(dp, XFS_ILOCK_SHARED);
481 if (!(flags & ATTR_SECURE) &&
482 (error = xfs_iaccess(dp, S_IWUSR, cred))) {
483 xfs_iunlock(dp, XFS_ILOCK_SHARED);
484 return(XFS_ERROR(error));
485 } else if (XFS_IFORK_Q(dp) == 0 ||
486 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
487 dp->i_d.di_anextents == 0)) {
488 xfs_iunlock(dp, XFS_ILOCK_SHARED);
489 return(XFS_ERROR(ENOATTR));
490 }
491 xfs_iunlock(dp, XFS_ILOCK_SHARED);
492
493 /*
494 * Fill in the arg structure for this request.
495 */
496 memset((char *)&args, 0, sizeof(args));
497 args.name = name;
498 args.namelen = namelen;
499 args.flags = flags;
500 args.hashval = xfs_da_hashname(args.name, args.namelen);
501 args.dp = dp;
502 args.firstblock = &firstblock;
503 args.flist = &flist;
504 args.total = 0;
505 args.whichfork = XFS_ATTR_FORK;
506
507 /*
508 * Attach the dquots to the inode.
509 */
510 if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
511 return (error);
512
513 /*
514 * Start our first transaction of the day.
515 *
516 * All future transactions during this code must be "chained" off
517 * this one via the trans_dup() call. All transactions will contain
518 * the inode, and the inode will always be marked with trans_ihold().
519 * Since the inode will be locked in all transactions, we must log
520 * the inode in every transaction to let it float upward through
521 * the log.
522 */
523 args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
524
525 /*
526 * Root fork attributes can use reserved data blocks for this
527 * operation if necessary
528 */
529
530 if (flags & ATTR_ROOT)
531 args.trans->t_flags |= XFS_TRANS_RESERVE;
532
533 if ((error = xfs_trans_reserve(args.trans,
534 XFS_ATTRRM_SPACE_RES(mp),
535 XFS_ATTRRM_LOG_RES(mp),
536 0, XFS_TRANS_PERM_LOG_RES,
537 XFS_ATTRRM_LOG_COUNT))) {
538 xfs_trans_cancel(args.trans, 0);
539 return(error);
540
541 }
542
543 xfs_ilock(dp, XFS_ILOCK_EXCL);
544 /*
545 * No need to make quota reservations here. We expect to release some
546 * blocks, not allocate, in the common case.
547 */
548 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
549 xfs_trans_ihold(args.trans, dp);
550
551 /*
552 * Decide on what work routines to call based on the inode size.
553 */
554 if (XFS_IFORK_Q(dp) == 0 ||
555 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
556 dp->i_d.di_anextents == 0)) {
557 error = XFS_ERROR(ENOATTR);
558 goto out;
559 }
560 if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
561 ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
562 error = xfs_attr_shortform_remove(&args);
563 if (error) {
564 goto out;
565 }
566 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
567 error = xfs_attr_leaf_removename(&args);
568 } else {
569 error = xfs_attr_node_removename(&args);
570 }
571 if (error) {
572 goto out;
573 }
574
575 /*
576 * If this is a synchronous mount, make sure that the
577 * transaction goes to disk before returning to the user.
578 */
579 if (mp->m_flags & XFS_MOUNT_WSYNC) {
580 xfs_trans_set_sync(args.trans);
581 }
582
583 /*
584 * Commit the last in the sequence of transactions.
585 */
586 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
587 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
588 NULL);
589 xfs_iunlock(dp, XFS_ILOCK_EXCL);
590
591 /*
592 * Hit the inode change time.
593 */
594 if (!error && (flags & ATTR_KERNOTIME) == 0) {
595 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
596 }
597
598 return(error);
599
600out:
601 if (args.trans)
602 xfs_trans_cancel(args.trans,
603 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
604 xfs_iunlock(dp, XFS_ILOCK_EXCL);
605 return(error);
606}
607
608/*
609 * Generate a list of extended attribute names and optionally
610 * also value lengths. A positive return value follows the XFS
611 * convention of being an error; a zero or negative return code
612 * is the (negated) length of the buffer returned, indicating
613 * success.
614 */
615int
616xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
617 attrlist_cursor_kern_t *cursor, struct cred *cred)
618{
619 xfs_attr_list_context_t context;
620 xfs_inode_t *dp;
621 int error;
622
623 XFS_STATS_INC(xs_attr_list);
624
625 /*
626 * Validate the cursor.
627 */
628 if (cursor->pad1 || cursor->pad2)
629 return(XFS_ERROR(EINVAL));
630 if ((cursor->initted == 0) &&
631 (cursor->hashval || cursor->blkno || cursor->offset))
632 return(XFS_ERROR(EINVAL));
633
634 /*
635 * Check for a properly aligned buffer.
636 */
637 if (((long)buffer) & (sizeof(int)-1))
638 return(XFS_ERROR(EFAULT));
639 if (flags & ATTR_KERNOVAL)
640 bufsize = 0;
641
642 /*
643 * Initialize the output buffer.
644 */
645 context.dp = dp = XFS_BHVTOI(bdp);
646 context.cursor = cursor;
647 context.count = 0;
648 context.dupcnt = 0;
649 context.resynch = 1;
650 context.flags = flags;
651 if (!(flags & ATTR_KERNAMELS)) {
652 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
653 context.firstu = context.bufsize;
654 context.alist = (attrlist_t *)buffer;
655 context.alist->al_count = 0;
656 context.alist->al_more = 0;
657 context.alist->al_offset[0] = context.bufsize;
658 }
659 else {
660 context.bufsize = bufsize;
661 context.firstu = context.bufsize;
662 context.alist = (attrlist_t *)buffer;
663 }
664
665 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
666 return (EIO);
667
668 xfs_ilock(dp, XFS_ILOCK_SHARED);
669 if (!(flags & ATTR_SECURE) &&
670 (error = xfs_iaccess(dp, S_IRUSR, cred))) {
671 xfs_iunlock(dp, XFS_ILOCK_SHARED);
672 return(XFS_ERROR(error));
673 }
674
675 /*
676 * Decide on what work routines to call based on the inode size.
677 */
678 xfs_attr_trace_l_c("syscall start", &context);
679 if (XFS_IFORK_Q(dp) == 0 ||
680 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
681 dp->i_d.di_anextents == 0)) {
682 error = 0;
683 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
684 error = xfs_attr_shortform_list(&context);
685 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
686 error = xfs_attr_leaf_list(&context);
687 } else {
688 error = xfs_attr_node_list(&context);
689 }
690 xfs_iunlock(dp, XFS_ILOCK_SHARED);
691 xfs_attr_trace_l_c("syscall end", &context);
692
693 if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) {
694 ASSERT(error >= 0);
695 }
696 else { /* must return negated buffer size or the error */
697 if (context.count < 0)
698 error = XFS_ERROR(ERANGE);
699 else
700 error = -context.count;
701 }
702
703 return(error);
704}
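
/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. Interpreting the mixed return convention documented above
 * from a hypothetical in-kernel caller using ATTR_KERNAMELS:
 */
STATIC int
example_list_names(bhv_desc_t *bdp, char *buffer, int bufsize,
		   attrlist_cursor_kern_t *cursor, struct cred *cred,
		   int *lenp)
{
	int ret;

	ret = xfs_attr_list(bdp, buffer, bufsize, ATTR_KERNAMELS,
			    cursor, cred);
	if (ret > 0)
		return ret;	/* positive: an XFS error code */
	*lenp = -ret;		/* zero or negative: negated length */
	return 0;
}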
705
706int /* error */
707xfs_attr_inactive(xfs_inode_t *dp)
708{
709 xfs_trans_t *trans;
710 xfs_mount_t *mp;
711 int error;
712
713 mp = dp->i_mount;
714 ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
715
716 xfs_ilock(dp, XFS_ILOCK_SHARED);
717 if ((XFS_IFORK_Q(dp) == 0) ||
718 (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
719 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
720 dp->i_d.di_anextents == 0)) {
721 xfs_iunlock(dp, XFS_ILOCK_SHARED);
722 return(0);
723 }
724 xfs_iunlock(dp, XFS_ILOCK_SHARED);
725
726 /*
727 * Start our first transaction of the day.
728 *
729 * All future transactions during this code must be "chained" off
730 * this one via the trans_dup() call. All transactions will contain
731 * the inode, and the inode will always be marked with trans_ihold().
732 * Since the inode will be locked in all transactions, we must log
733 * the inode in every transaction to let it float upward through
734 * the log.
735 */
736 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
737 if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
738 XFS_TRANS_PERM_LOG_RES,
739 XFS_ATTRINVAL_LOG_COUNT))) {
740 xfs_trans_cancel(trans, 0);
741 return(error);
742 }
743 xfs_ilock(dp, XFS_ILOCK_EXCL);
744
745 /*
746 * No need to make quota reservations here. We expect to release some
747 * blocks, not allocate, in the common case.
748 */
749 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
750 xfs_trans_ihold(trans, dp);
751
752 /*
753 * Decide on what work routines to call based on the inode size.
754 */
755 if ((XFS_IFORK_Q(dp) == 0) ||
756 (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
757 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
758 dp->i_d.di_anextents == 0)) {
759 error = 0;
760 goto out;
761 }
762 error = xfs_attr_root_inactive(&trans, dp);
763 if (error)
764 goto out;
765 /*
766 * Signal synchronous inactive transactions unless this
767 * is a synchronous mount filesystem, in which case we
768 * know we're here because we've been called out of
769 * xfs_inactive, which means the last reference is gone
770 * and the unlink transaction has already hit the disk,
771 * so async inactive transactions are safe.
772 */
773 if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK,
774 (!(mp->m_flags & XFS_MOUNT_WSYNC)
775 ? 1 : 0))))
776 goto out;
777
778 /*
779 * Commit the last in the sequence of transactions.
780 */
781 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
782 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES,
783 NULL);
784 xfs_iunlock(dp, XFS_ILOCK_EXCL);
785
786 return(error);
787
788out:
789 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
790 xfs_iunlock(dp, XFS_ILOCK_EXCL);
791 return(error);
792}
793
794
795
796/*========================================================================
797 * External routines when attribute list is inside the inode
798 *========================================================================*/
799
800/*
801 * Add a name to the shortform attribute list structure
802 * This is the external routine.
803 */
804STATIC int
805xfs_attr_shortform_addname(xfs_da_args_t *args)
806{
807 int newsize, retval;
808
809 retval = xfs_attr_shortform_lookup(args);
810 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
811 return(retval);
812 } else if (retval == EEXIST) {
813 if (args->flags & ATTR_CREATE)
814 return(retval);
815 retval = xfs_attr_shortform_remove(args);
816 ASSERT(retval == 0);
817 }
818
819 newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
820 newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
821 if ((newsize <= XFS_IFORK_ASIZE(args->dp)) &&
822 (args->namelen < XFS_ATTR_SF_ENTSIZE_MAX) &&
823 (args->valuelen < XFS_ATTR_SF_ENTSIZE_MAX)) {
824 retval = xfs_attr_shortform_add(args);
825 ASSERT(retval == 0);
826 } else {
827 return(XFS_ERROR(ENOSPC));
828 }
829 return(0);
830}
831
832
833/*========================================================================
834 * External routines when attribute list is one block
835 *========================================================================*/
836
837/*
838 * Add a name to the leaf attribute list structure
839 *
840 * This leaf block cannot have a "remote" value; we only call this routine
841 * if bmap_one_block() says there is only one block (i.e. no remote blks).
842 */
843int
844xfs_attr_leaf_addname(xfs_da_args_t *args)
845{
846 xfs_inode_t *dp;
847 xfs_dabuf_t *bp;
848 int retval, error, committed;
849
850 /*
851 * Read in the (only) block of the attribute list.
852 */
853 dp = args->dp;
854 args->blkno = 0;
855 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
856 XFS_ATTR_FORK);
857 if (error)
858 return(error);
859 ASSERT(bp != NULL);
860
861 /*
862 * Look up the given attribute in the leaf block. Figure out if
863 * the given flags produce an error or call for an atomic rename.
864 */
865 retval = xfs_attr_leaf_lookup_int(bp, args);
866 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
867 xfs_da_brelse(args->trans, bp);
868 return(retval);
869 } else if (retval == EEXIST) {
870 if (args->flags & ATTR_CREATE) { /* pure create op */
871 xfs_da_brelse(args->trans, bp);
872 return(retval);
873 }
874 args->rename = 1; /* an atomic rename */
875 args->blkno2 = args->blkno; /* set 2nd entry info */
876 args->index2 = args->index;
877 args->rmtblkno2 = args->rmtblkno;
878 args->rmtblkcnt2 = args->rmtblkcnt;
879 }
880
881 /*
882 * Add the attribute to the leaf block, transitioning to a Btree
883 * if required.
884 */
885 retval = xfs_attr_leaf_add(bp, args);
886 xfs_da_buf_done(bp);
887 if (retval == ENOSPC) {
888 /*
889 * Promote the attribute list to the Btree format, then
890 * Commit that transaction so that the node_addname() call
891 * can manage its own transactions.
892 */
893 XFS_BMAP_INIT(args->flist, args->firstblock);
894 error = xfs_attr_leaf_to_node(args);
895 if (!error) {
896 error = xfs_bmap_finish(&args->trans, args->flist,
897 *args->firstblock, &committed);
898 }
899 if (error) {
900 ASSERT(committed);
901 args->trans = NULL;
902 xfs_bmap_cancel(args->flist);
903 return(error);
904 }
905
906 /*
907 * bmap_finish() may have committed the last trans and started
908 * a new one. We need the inode to be in all transactions.
909 */
910 if (committed) {
911 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
912 xfs_trans_ihold(args->trans, dp);
913 }
914
915 /*
916 * Commit the current trans (including the inode) and start
917 * a new one.
918 */
919 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
920 return (error);
921
922 /*
923 * Fob the whole rest of the problem off on the Btree code.
924 */
925 error = xfs_attr_node_addname(args);
926 return(error);
927 }
928
929 /*
930 * Commit the transaction that added the attr name so that
931 * later routines can manage their own transactions.
932 */
933 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
934 return (error);
935
936 /*
937 * If there was an out-of-line value, allocate the blocks we
938 * identified for its storage and copy the value. This is done
939 * after we create the attribute so that we don't overflow the
940 * maximum size of a transaction and/or hit a deadlock.
941 */
942 if (args->rmtblkno > 0) {
943 error = xfs_attr_rmtval_set(args);
944 if (error)
945 return(error);
946 }
947
948 /*
949 * If this is an atomic rename operation, we must "flip" the
950 * incomplete flags on the "new" and "old" attribute/value pairs
951 * so that one disappears and one appears atomically. Then we
952 * must remove the "old" attribute/value pair.
953 */
954 if (args->rename) {
955 /*
956 * In a separate transaction, set the incomplete flag on the
957 * "old" attr and clear the incomplete flag on the "new" attr.
958 */
959 error = xfs_attr_leaf_flipflags(args);
960 if (error)
961 return(error);
962
963 /*
964 * Dismantle the "old" attribute/value pair by removing
965 * a "remote" value (if it exists).
966 */
967 args->index = args->index2;
968 args->blkno = args->blkno2;
969 args->rmtblkno = args->rmtblkno2;
970 args->rmtblkcnt = args->rmtblkcnt2;
971 if (args->rmtblkno) {
972 error = xfs_attr_rmtval_remove(args);
973 if (error)
974 return(error);
975 }
976
977 /*
978 * Read in the block containing the "old" attr, then
979 * remove the "old" attr from that block (neat, huh!)
980 */
981 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
982 &bp, XFS_ATTR_FORK);
983 if (error)
984 return(error);
985 ASSERT(bp != NULL);
986 (void)xfs_attr_leaf_remove(bp, args);
987
988 /*
989 * If the result is small enough, shrink it all into the inode.
990 */
991 if (xfs_attr_shortform_allfit(bp, dp)) {
992 XFS_BMAP_INIT(args->flist, args->firstblock);
993 error = xfs_attr_leaf_to_shortform(bp, args);
994 /* bp is gone due to xfs_da_shrink_inode */
995 if (!error) {
996 error = xfs_bmap_finish(&args->trans,
997 args->flist,
998 *args->firstblock,
999 &committed);
1000 }
1001 if (error) {
1002 ASSERT(committed);
1003 args->trans = NULL;
1004 xfs_bmap_cancel(args->flist);
1005 return(error);
1006 }
1007
1008 /*
1009 * bmap_finish() may have committed the last trans
1010 * and started a new one. We need the inode to be
1011 * in all transactions.
1012 */
1013 if (committed) {
1014 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1015 xfs_trans_ihold(args->trans, dp);
1016 }
1017 } else
1018 xfs_da_buf_done(bp);
1019
1020 /*
1021 * Commit the remove and start the next trans in series.
1022 */
1023 error = xfs_attr_rolltrans(&args->trans, dp);
1024
1025 } else if (args->rmtblkno > 0) {
1026 /*
1027 * Added a "remote" value, just clear the incomplete flag.
1028 */
1029 error = xfs_attr_leaf_clearflag(args);
1030 }
1031 return(error);
1032}
1033
1034/*
1035 * Remove a name from the leaf attribute list structure
1036 *
1037 * This leaf block cannot have a "remote" value; we only call this routine
1038 * if bmap_one_block() says there is only one block (i.e. no remote blks).
1039 */
1040STATIC int
1041xfs_attr_leaf_removename(xfs_da_args_t *args)
1042{
1043 xfs_inode_t *dp;
1044 xfs_dabuf_t *bp;
1045 int committed;
1046 int error;
1047
1048 /*
1049 * Remove the attribute.
1050 */
1051 dp = args->dp;
1052 args->blkno = 0;
1053 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
1054 XFS_ATTR_FORK);
1055 if (error) {
1056 return(error);
1057 }
1058
1059 ASSERT(bp != NULL);
1060 error = xfs_attr_leaf_lookup_int(bp, args);
1061 if (error == ENOATTR) {
1062 xfs_da_brelse(args->trans, bp);
1063 return(error);
1064 }
1065
1066 (void)xfs_attr_leaf_remove(bp, args);
1067
1068 /*
1069 * If the result is small enough, shrink it all into the inode.
1070 */
1071 if (xfs_attr_shortform_allfit(bp, dp)) {
1072 XFS_BMAP_INIT(args->flist, args->firstblock);
1073 error = xfs_attr_leaf_to_shortform(bp, args);
1074 /* bp is gone due to xfs_da_shrink_inode */
1075 if (!error) {
1076 error = xfs_bmap_finish(&args->trans, args->flist,
1077 *args->firstblock, &committed);
1078 }
1079 if (error) {
1080 ASSERT(committed);
1081 args->trans = NULL;
1082 xfs_bmap_cancel(args->flist);
1083 return(error);
1084 }
1085
1086 /*
1087 * bmap_finish() may have committed the last trans and started
1088 * a new one. We need the inode to be in all transactions.
1089 */
1090 if (committed) {
1091 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1092 xfs_trans_ihold(args->trans, dp);
1093 }
1094 } else
1095 xfs_da_buf_done(bp);
1096 return(0);
1097}
1098
1099/*
1100 * Look up a name in a leaf attribute list structure.
1101 *
1102 * This leaf block cannot have a "remote" value; we only call this routine
1103 * if bmap_one_block() says there is only one block (i.e. no remote blks).
1104 */
1105int
1106xfs_attr_leaf_get(xfs_da_args_t *args)
1107{
1108 xfs_dabuf_t *bp;
1109 int error;
1110
1111 args->blkno = 0;
1112 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
1113 XFS_ATTR_FORK);
1114 if (error)
1115 return(error);
1116 ASSERT(bp != NULL);
1117
1118 error = xfs_attr_leaf_lookup_int(bp, args);
1119 if (error != EEXIST) {
1120 xfs_da_brelse(args->trans, bp);
1121 return(error);
1122 }
1123 error = xfs_attr_leaf_getvalue(bp, args);
1124 xfs_da_brelse(args->trans, bp);
1125 if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
1126 error = xfs_attr_rmtval_get(args);
1127 }
1128 return(error);
1129}
1130
1131/*
1132 * Copy out attribute entries for attr_list(), for leaf attribute lists.
1133 */
1134STATIC int
1135xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1136{
1137 xfs_attr_leafblock_t *leaf;
1138 int error;
1139 xfs_dabuf_t *bp;
1140
1141 context->cursor->blkno = 0;
1142 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
1143 if (error)
1144 return(error);
1145 ASSERT(bp != NULL);
1146 leaf = bp->data;
1147 if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1148 != XFS_ATTR_LEAF_MAGIC)) {
1149 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
1150 context->dp->i_mount, leaf);
1151 xfs_da_brelse(NULL, bp);
1152 return(XFS_ERROR(EFSCORRUPTED));
1153 }
1154
1155 (void)xfs_attr_leaf_list_int(bp, context);
1156 xfs_da_brelse(NULL, bp);
1157 return(0);
1158}
1159
1160
1161/*========================================================================
1162 * External routines when attribute list size > XFS_LBSIZE(mp).
1163 *========================================================================*/
1164
1165/*
1166 * Add a name to a Btree-format attribute list.
1167 *
1168 * This will involve walking down the Btree, and may involve splitting
1169 * leaf nodes and even splitting intermediate nodes up to and including
1170 * the root node (a special case of an intermediate node).
1171 *
1172 * "Remote" attribute values confuse the issue and atomic rename operations
1173 * add a whole extra layer of confusion on top of that.
1174 */
1175STATIC int
1176xfs_attr_node_addname(xfs_da_args_t *args)
1177{
1178 xfs_da_state_t *state;
1179 xfs_da_state_blk_t *blk;
1180 xfs_inode_t *dp;
1181 xfs_mount_t *mp;
1182 int committed, retval, error;
1183
1184 /*
1185 * Fill in bucket of arguments/results/context to carry around.
1186 */
1187 dp = args->dp;
1188 mp = dp->i_mount;
1189restart:
1190 state = xfs_da_state_alloc();
1191 state->args = args;
1192 state->mp = mp;
1193 state->blocksize = state->mp->m_sb.sb_blocksize;
1194 state->node_ents = state->mp->m_attr_node_ents;
1195
1196 /*
1197 * Search to see if name already exists, and get back a pointer
1198 * to where it should go.
1199 */
1200 error = xfs_da_node_lookup_int(state, &retval);
1201 if (error)
1202 goto out;
1203 blk = &state->path.blk[ state->path.active-1 ];
1204 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1205 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
1206 goto out;
1207 } else if (retval == EEXIST) {
1208 if (args->flags & ATTR_CREATE)
1209 goto out;
1210 args->rename = 1; /* atomic rename op */
1211 args->blkno2 = args->blkno; /* set 2nd entry info */
1212 args->index2 = args->index;
1213 args->rmtblkno2 = args->rmtblkno;
1214 args->rmtblkcnt2 = args->rmtblkcnt;
1215 args->rmtblkno = 0;
1216 args->rmtblkcnt = 0;
1217 }
1218
1219 retval = xfs_attr_leaf_add(blk->bp, state->args);
1220 if (retval == ENOSPC) {
1221 if (state->path.active == 1) {
1222 /*
1223 * It's really a single leaf node, but it had
1224 * out-of-line values, so it looked like it *might*
1225 * have been a b-tree.
1226 */
1227 xfs_da_state_free(state);
1228 XFS_BMAP_INIT(args->flist, args->firstblock);
1229 error = xfs_attr_leaf_to_node(args);
1230 if (!error) {
1231 error = xfs_bmap_finish(&args->trans,
1232 args->flist,
1233 *args->firstblock,
1234 &committed);
1235 }
1236 if (error) {
1237 ASSERT(committed);
1238 args->trans = NULL;
1239 xfs_bmap_cancel(args->flist);
1240 goto out;
1241 }
1242
1243 /*
1244 * bmap_finish() may have committed the last trans
1245 * and started a new one. We need the inode to be
1246 * in all transactions.
1247 */
1248 if (committed) {
1249 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1250 xfs_trans_ihold(args->trans, dp);
1251 }
1252
1253 /*
1254 * Commit the node conversion and start the next
1255 * trans in the chain.
1256 */
1257 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1258 goto out;
1259
1260 goto restart;
1261 }
1262
1263 /*
1264 * Split as many Btree elements as required.
1265 * This code tracks the new and old attr's location
1266 * in the index/blkno/rmtblkno/rmtblkcnt fields and
1267 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
1268 */
1269 XFS_BMAP_INIT(args->flist, args->firstblock);
1270 error = xfs_da_split(state);
1271 if (!error) {
1272 error = xfs_bmap_finish(&args->trans, args->flist,
1273 *args->firstblock, &committed);
1274 }
1275 if (error) {
1276 ASSERT(committed);
1277 args->trans = NULL;
1278 xfs_bmap_cancel(args->flist);
1279 goto out;
1280 }
1281
1282 /*
1283 * bmap_finish() may have committed the last trans and started
1284 * a new one. We need the inode to be in all transactions.
1285 */
1286 if (committed) {
1287 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1288 xfs_trans_ihold(args->trans, dp);
1289 }
1290 } else {
1291 /*
1292 * Addition succeeded, update Btree hashvals.
1293 */
1294 xfs_da_fixhashpath(state, &state->path);
1295 }
1296
1297 /*
1298 * Kill the state structure, we're done with it and need to
1299 * allow the buffers to come back later.
1300 */
1301 xfs_da_state_free(state);
1302 state = NULL;
1303
1304 /*
1305 * Commit the leaf addition or btree split and start the next
1306 * trans in the chain.
1307 */
1308 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1309 goto out;
1310
1311 /*
1312 * If there was an out-of-line value, allocate the blocks we
1313 * identified for its storage and copy the value. This is done
1314 * after we create the attribute so that we don't overflow the
1315 * maximum size of a transaction and/or hit a deadlock.
1316 */
1317 if (args->rmtblkno > 0) {
1318 error = xfs_attr_rmtval_set(args);
1319 if (error)
1320 return(error);
1321 }
1322
1323 /*
1324 * If this is an atomic rename operation, we must "flip" the
1325 * incomplete flags on the "new" and "old" attribute/value pairs
1326 * so that one disappears and one appears atomically. Then we
1327 * must remove the "old" attribute/value pair.
1328 */
1329 if (args->rename) {
1330 /*
1331 * In a separate transaction, set the incomplete flag on the
1332 * "old" attr and clear the incomplete flag on the "new" attr.
1333 */
1334 error = xfs_attr_leaf_flipflags(args);
1335 if (error)
1336 goto out;
1337
1338 /*
1339 * Dismantle the "old" attribute/value pair by removing
1340 * a "remote" value (if it exists).
1341 */
1342 args->index = args->index2;
1343 args->blkno = args->blkno2;
1344 args->rmtblkno = args->rmtblkno2;
1345 args->rmtblkcnt = args->rmtblkcnt2;
1346 if (args->rmtblkno) {
1347 error = xfs_attr_rmtval_remove(args);
1348 if (error)
1349 return(error);
1350 }
1351
1352 /*
1353 * Re-find the "old" attribute entry after any split ops.
1354 * The INCOMPLETE flag means that we will find the "old"
1355 * attr, not the "new" one.
1356 */
1357 args->flags |= XFS_ATTR_INCOMPLETE;
1358 state = xfs_da_state_alloc();
1359 state->args = args;
1360 state->mp = mp;
1361 state->blocksize = state->mp->m_sb.sb_blocksize;
1362 state->node_ents = state->mp->m_attr_node_ents;
1363 state->inleaf = 0;
1364 error = xfs_da_node_lookup_int(state, &retval);
1365 if (error)
1366 goto out;
1367
1368 /*
1369 * Remove the name and update the hashvals in the tree.
1370 */
1371 blk = &state->path.blk[ state->path.active-1 ];
1372 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1373 error = xfs_attr_leaf_remove(blk->bp, args);
1374 xfs_da_fixhashpath(state, &state->path);
1375
1376 /*
1377 * Check to see if the tree needs to be collapsed.
1378 */
1379 if (retval && (state->path.active > 1)) {
1380 XFS_BMAP_INIT(args->flist, args->firstblock);
1381 error = xfs_da_join(state);
1382 if (!error) {
1383 error = xfs_bmap_finish(&args->trans,
1384 args->flist,
1385 *args->firstblock,
1386 &committed);
1387 }
1388 if (error) {
1389 ASSERT(committed);
1390 args->trans = NULL;
1391 xfs_bmap_cancel(args->flist);
1392 goto out;
1393 }
1394
1395 /*
1396 * bmap_finish() may have committed the last trans
1397 * and started a new one. We need the inode to be
1398 * in all transactions.
1399 */
1400 if (committed) {
1401 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1402 xfs_trans_ihold(args->trans, dp);
1403 }
1404 }
1405
1406 /*
1407 * Commit and start the next trans in the chain.
1408 */
1409 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1410 goto out;
1411
1412 } else if (args->rmtblkno > 0) {
1413 /*
1414 * Added a "remote" value, just clear the incomplete flag.
1415 */
1416 error = xfs_attr_leaf_clearflag(args);
1417 if (error)
1418 goto out;
1419 }
1420 retval = error = 0;
1421
1422out:
1423 if (state)
1424 xfs_da_state_free(state);
1425 if (error)
1426 return(error);
1427 return(retval);
1428}
1429
1430/*
1431 * Remove a name from a B-tree attribute list.
1432 *
1433 * This will involve walking down the Btree, and may involve joining
1434 * leaf nodes and even joining intermediate nodes up to and including
1435 * the root node (a special case of an intermediate node).
1436 */
1437STATIC int
1438xfs_attr_node_removename(xfs_da_args_t *args)
1439{
1440 xfs_da_state_t *state;
1441 xfs_da_state_blk_t *blk;
1442 xfs_inode_t *dp;
1443 xfs_dabuf_t *bp;
1444 int retval, error, committed;
1445
1446 /*
1447 * Tie a string around our finger to remind us where we are.
1448 */
1449 dp = args->dp;
1450 state = xfs_da_state_alloc();
1451 state->args = args;
1452 state->mp = dp->i_mount;
1453 state->blocksize = state->mp->m_sb.sb_blocksize;
1454 state->node_ents = state->mp->m_attr_node_ents;
1455
1456 /*
1457 * Search to see if name exists, and get back a pointer to it.
1458 */
1459 error = xfs_da_node_lookup_int(state, &retval);
1460 if (error || (retval != EEXIST)) {
1461 if (error == 0)
1462 error = retval;
1463 goto out;
1464 }
1465
1466 /*
1467 * If there is an out-of-line value, de-allocate the blocks.
1468 * This is done before we remove the attribute so that we don't
1469 * overflow the maximum size of a transaction and/or hit a deadlock.
1470 */
1471 blk = &state->path.blk[ state->path.active-1 ];
1472 ASSERT(blk->bp != NULL);
1473 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1474 if (args->rmtblkno > 0) {
1475 /*
1476 * Fill in disk block numbers in the state structure
1477 * so that we can get the buffers back after we commit
1478 * several transactions in the following calls.
1479 */
1480 error = xfs_attr_fillstate(state);
1481 if (error)
1482 goto out;
1483
1484 /*
1485 * Mark the attribute as INCOMPLETE, then bunmapi() the
1486 * remote value.
1487 */
1488 error = xfs_attr_leaf_setflag(args);
1489 if (error)
1490 goto out;
1491 error = xfs_attr_rmtval_remove(args);
1492 if (error)
1493 goto out;
1494
1495 /*
1496 * Refill the state structure with buffers, the prior calls
1497 * released our buffers.
1498 */
1499 error = xfs_attr_refillstate(state);
1500 if (error)
1501 goto out;
1502 }
1503
1504 /*
1505 * Remove the name and update the hashvals in the tree.
1506 */
1507 blk = &state->path.blk[ state->path.active-1 ];
1508 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1509 retval = xfs_attr_leaf_remove(blk->bp, args);
1510 xfs_da_fixhashpath(state, &state->path);
1511
1512 /*
1513 * Check to see if the tree needs to be collapsed.
1514 */
1515 if (retval && (state->path.active > 1)) {
1516 XFS_BMAP_INIT(args->flist, args->firstblock);
1517 error = xfs_da_join(state);
1518 if (!error) {
1519 error = xfs_bmap_finish(&args->trans, args->flist,
1520 *args->firstblock, &committed);
1521 }
1522 if (error) {
1523 ASSERT(committed);
1524 args->trans = NULL;
1525 xfs_bmap_cancel(args->flist);
1526 goto out;
1527 }
1528
1529 /*
1530 * bmap_finish() may have committed the last trans and started
1531 * a new one. We need the inode to be in all transactions.
1532 */
1533 if (committed) {
1534 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1535 xfs_trans_ihold(args->trans, dp);
1536 }
1537
1538 /*
1539 * Commit the Btree join operation and start a new trans.
1540 */
1541 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1542 goto out;
1543 }
1544
1545 /*
1546 * If the result is small enough, push it all into the inode.
1547 */
1548 if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
1549 /*
1550 * Have to get rid of the copy of this dabuf in the state.
1551 */
1552 ASSERT(state->path.active == 1);
1553 ASSERT(state->path.blk[0].bp);
1554 xfs_da_buf_done(state->path.blk[0].bp);
1555 state->path.blk[0].bp = NULL;
1556
1557 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
1558 XFS_ATTR_FORK);
1559 if (error)
1560 goto out;
1561 ASSERT(INT_GET(((xfs_attr_leafblock_t *)
1562 bp->data)->hdr.info.magic, ARCH_CONVERT)
1563 == XFS_ATTR_LEAF_MAGIC);
1564
1565 if (xfs_attr_shortform_allfit(bp, dp)) {
1566 XFS_BMAP_INIT(args->flist, args->firstblock);
1567 error = xfs_attr_leaf_to_shortform(bp, args);
1568 /* bp is gone due to xfs_da_shrink_inode */
1569 if (!error) {
1570 error = xfs_bmap_finish(&args->trans,
1571 args->flist,
1572 *args->firstblock,
1573 &committed);
1574 }
1575 if (error) {
1576 ASSERT(committed);
1577 args->trans = NULL;
1578 xfs_bmap_cancel(args->flist);
1579 goto out;
1580 }
1581
1582 /*
1583 * bmap_finish() may have committed the last trans
1584 * and started a new one. We need the inode to be
1585 * in all transactions.
1586 */
1587 if (committed) {
1588 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1589 xfs_trans_ihold(args->trans, dp);
1590 }
1591 } else
1592 xfs_da_brelse(args->trans, bp);
1593 }
1594 error = 0;
1595
1596out:
1597 xfs_da_state_free(state);
1598 return(error);
1599}
1600
1601/*
1602 * Fill in the disk block numbers in the state structure for the buffers
1603 * that are attached to the state structure.
1604 * This is done so that we can quickly reattach ourselves to those buffers
1605 * after some set of transaction commits has released these buffers.
1606 */
1607STATIC int
1608xfs_attr_fillstate(xfs_da_state_t *state)
1609{
1610 xfs_da_state_path_t *path;
1611 xfs_da_state_blk_t *blk;
1612 int level;
1613
1614 /*
1615 * Roll down the "path" in the state structure, storing the on-disk
1616 * block number for those buffers in the "path".
1617 */
1618 path = &state->path;
1619 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1620 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1621 if (blk->bp) {
1622 blk->disk_blkno = xfs_da_blkno(blk->bp);
1623 xfs_da_buf_done(blk->bp);
1624 blk->bp = NULL;
1625 } else {
1626 blk->disk_blkno = 0;
1627 }
1628 }
1629
1630 /*
1631 * Roll down the "altpath" in the state structure, storing the on-disk
1632 * block number for those buffers in the "altpath".
1633 */
1634 path = &state->altpath;
1635 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1636 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1637 if (blk->bp) {
1638 blk->disk_blkno = xfs_da_blkno(blk->bp);
1639 xfs_da_buf_done(blk->bp);
1640 blk->bp = NULL;
1641 } else {
1642 blk->disk_blkno = 0;
1643 }
1644 }
1645
1646 return(0);
1647}
1648
1649/*
1650 * Reattach the buffers to the state structure based on the disk block
1651 * numbers stored in the state structure.
1652 * This is done after some set of transaction commits has released those
1653 * buffers from our grip.
1654 */
1655STATIC int
1656xfs_attr_refillstate(xfs_da_state_t *state)
1657{
1658 xfs_da_state_path_t *path;
1659 xfs_da_state_blk_t *blk;
1660 int level, error;
1661
1662 /*
1663 * Roll down the "path" in the state structure, reattaching buffers
1664 * from the on-disk block numbers stored in the "path".
1665 */
1666 path = &state->path;
1667 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1668 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1669 if (blk->disk_blkno) {
1670 error = xfs_da_read_buf(state->args->trans,
1671 state->args->dp,
1672 blk->blkno, blk->disk_blkno,
1673 &blk->bp, XFS_ATTR_FORK);
1674 if (error)
1675 return(error);
1676 } else {
1677 blk->bp = NULL;
1678 }
1679 }
1680
1681 /*
1682 * Roll down the "altpath" in the state structure, reattaching buffers
1683 * from the on-disk block numbers stored in the "altpath".
1684 */
1685 path = &state->altpath;
1686 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1687 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1688 if (blk->disk_blkno) {
1689 error = xfs_da_read_buf(state->args->trans,
1690 state->args->dp,
1691 blk->blkno, blk->disk_blkno,
1692 &blk->bp, XFS_ATTR_FORK);
1693 if (error)
1694 return(error);
1695 } else {
1696 blk->bp = NULL;
1697 }
1698 }
1699
1700 return(0);
1701}
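/*
 * Editorial sketch (assumption, not part of the original source):
 * fillstate/refillstate are meant to bracket a transaction commit,
 * since committing releases the dabuf's held in the state.  A caller
 * would do, roughly (error handling omitted):
 *
 *	error = xfs_attr_fillstate(state);		drop bp's, keep disk blknos
 *	error = xfs_attr_rolltrans(&args->trans, dp);	commit, start new trans
 *	error = xfs_attr_refillstate(state);		re-read bp's from disk blknos
 *
 * so that no stale buffer pointers survive the commit.
 */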
1702
1703/*
1704 * Look up a name in a node attribute list.
1705 *
1706 * This routine gets called for any attribute fork that has more than one
1707 * block, i.e. both true Btree attr lists and single-leaf-blocks with
1708 * "remote" values taking up more blocks.
1709 */
1710int
1711xfs_attr_node_get(xfs_da_args_t *args)
1712{
1713 xfs_da_state_t *state;
1714 xfs_da_state_blk_t *blk;
1715 int error, retval;
1716 int i;
1717
1718 state = xfs_da_state_alloc();
1719 state->args = args;
1720 state->mp = args->dp->i_mount;
1721 state->blocksize = state->mp->m_sb.sb_blocksize;
1722 state->node_ents = state->mp->m_attr_node_ents;
1723
1724 /*
1725 * Search to see if name exists, and get back a pointer to it.
1726 */
1727 error = xfs_da_node_lookup_int(state, &retval);
1728 if (error) {
1729 retval = error;
1730 } else if (retval == EEXIST) {
1731 blk = &state->path.blk[ state->path.active-1 ];
1732 ASSERT(blk->bp != NULL);
1733 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1734
1735 /*
1736 * Get the value, local or "remote"
1737 */
1738 retval = xfs_attr_leaf_getvalue(blk->bp, args);
1739 if (!retval && (args->rmtblkno > 0)
1740 && !(args->flags & ATTR_KERNOVAL)) {
1741 retval = xfs_attr_rmtval_get(args);
1742 }
1743 }
1744
1745 /*
1746 * If not in a transaction, we have to release all the buffers.
1747 */
1748 for (i = 0; i < state->path.active; i++) {
1749 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1750 state->path.blk[i].bp = NULL;
1751 }
1752
1753 xfs_da_state_free(state);
1754 return(retval);
1755}
1756
1757STATIC int /* error */
1758xfs_attr_node_list(xfs_attr_list_context_t *context)
1759{
1760 attrlist_cursor_kern_t *cursor;
1761 xfs_attr_leafblock_t *leaf;
1762 xfs_da_intnode_t *node;
1763 xfs_da_node_entry_t *btree;
1764 int error, i;
1765 xfs_dabuf_t *bp;
1766
1767 cursor = context->cursor;
1768 cursor->initted = 1;
1769
1770 /*
1771 * Do all sorts of validation on the passed-in cursor structure.
1772 * If anything is amiss, ignore the cursor and look up the hashval
1773 * starting from the btree root.
1774 */
1775 bp = NULL;
1776 if (cursor->blkno > 0) {
1777 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
1778 &bp, XFS_ATTR_FORK);
1779 if ((error != 0) && (error != EFSCORRUPTED))
1780 return(error);
1781 if (bp) {
1782 node = bp->data;
1783 switch (INT_GET(node->hdr.info.magic, ARCH_CONVERT)) {
1784 case XFS_DA_NODE_MAGIC:
1785 xfs_attr_trace_l_cn("wrong blk", context, node);
1786 xfs_da_brelse(NULL, bp);
1787 bp = NULL;
1788 break;
1789 case XFS_ATTR_LEAF_MAGIC:
1790 leaf = bp->data;
1791 if (cursor->hashval >
1792 INT_GET(leaf->entries[
1793 INT_GET(leaf->hdr.count,
1794 ARCH_CONVERT)-1].hashval,
1795 ARCH_CONVERT)) {
1796 xfs_attr_trace_l_cl("wrong blk",
1797 context, leaf);
1798 xfs_da_brelse(NULL, bp);
1799 bp = NULL;
1800 } else if (cursor->hashval <=
1801 INT_GET(leaf->entries[0].hashval,
1802 ARCH_CONVERT)) {
1803 xfs_attr_trace_l_cl("maybe wrong blk",
1804 context, leaf);
1805 xfs_da_brelse(NULL, bp);
1806 bp = NULL;
1807 }
1808 break;
1809 default:
1810 xfs_attr_trace_l_c("wrong blk - ??", context);
1811 xfs_da_brelse(NULL, bp);
1812 bp = NULL;
1813 }
1814 }
1815 }
1816
1817 /*
1818 * We did not find what we expected given the cursor's contents,
1819 * so we start from the top and work down based on the hash value.
1820 * Note that the start of a node block is the same as that of a leaf block.
1821 */
1822 if (bp == NULL) {
1823 cursor->blkno = 0;
1824 for (;;) {
1825 error = xfs_da_read_buf(NULL, context->dp,
1826 cursor->blkno, -1, &bp,
1827 XFS_ATTR_FORK);
1828 if (error)
1829 return(error);
1830 if (unlikely(bp == NULL)) {
1831 XFS_ERROR_REPORT("xfs_attr_node_list(2)",
1832 XFS_ERRLEVEL_LOW,
1833 context->dp->i_mount);
1834 return(XFS_ERROR(EFSCORRUPTED));
1835 }
1836 node = bp->data;
1837 if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
1838 == XFS_ATTR_LEAF_MAGIC)
1839 break;
1840 if (unlikely(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
1841 != XFS_DA_NODE_MAGIC)) {
1842 XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
1843 XFS_ERRLEVEL_LOW,
1844 context->dp->i_mount,
1845 node);
1846 xfs_da_brelse(NULL, bp);
1847 return(XFS_ERROR(EFSCORRUPTED));
1848 }
1849 btree = node->btree;
1850 for (i = 0;
1851 i < INT_GET(node->hdr.count, ARCH_CONVERT);
1852 btree++, i++) {
1853 if (cursor->hashval
1854 <= INT_GET(btree->hashval,
1855 ARCH_CONVERT)) {
1856 cursor->blkno = INT_GET(btree->before, ARCH_CONVERT);
1857 xfs_attr_trace_l_cb("descending",
1858 context, btree);
1859 break;
1860 }
1861 }
1862 if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
1863 xfs_da_brelse(NULL, bp);
1864 return(0);
1865 }
1866 xfs_da_brelse(NULL, bp);
1867 }
1868 }
1869 ASSERT(bp != NULL);
1870
1871 /*
1872 * Roll upward through the blocks, processing each leaf block in
1873 * order. As long as there is space in the result buffer, keep
1874 * adding the information.
1875 */
1876 for (;;) {
1877 leaf = bp->data;
1878 if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1879 != XFS_ATTR_LEAF_MAGIC)) {
1880 XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
1881 XFS_ERRLEVEL_LOW,
1882 context->dp->i_mount, leaf);
1883 xfs_da_brelse(NULL, bp);
1884 return(XFS_ERROR(EFSCORRUPTED));
1885 }
1886 error = xfs_attr_leaf_list_int(bp, context);
1887 if (error || !leaf->hdr.info.forw)
1888 break; /* not really an error, buffer full or EOF */
1889 cursor->blkno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT);
1890 xfs_da_brelse(NULL, bp);
1891 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
1892 &bp, XFS_ATTR_FORK);
1893 if (error)
1894 return(error);
1895 if (unlikely((bp == NULL))) {
1896 XFS_ERROR_REPORT("xfs_attr_node_list(5)",
1897 XFS_ERRLEVEL_LOW,
1898 context->dp->i_mount);
1899 return(XFS_ERROR(EFSCORRUPTED));
1900 }
1901 }
1902 xfs_da_brelse(NULL, bp);
1903 return(0);
1904}
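/*
 * Editorial sketch (assumption, not part of the original source): the
 * cursor updated above is what lets a listing resume where it stopped.
 * User level keeps calling attr_list() with the same cursor while the
 * returned attrlist_t says there is more, roughly:
 *
 *	do {
 *		error = attr_list(path, buf, bufsize, flags, &cursor);
 *		...consume ((attrlist_t *)buf)->al_count entries...
 *	} while (!error && ((attrlist_t *)buf)->al_more);
 */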
1905
1906
1907/*========================================================================
1908 * External routines for manipulating out-of-line attribute values.
1909 *========================================================================*/
1910
1911/*
1912 * Read the value associated with an attribute from the out-of-line buffer
1913 * that we stored it in.
1914 */
1915STATIC int
1916xfs_attr_rmtval_get(xfs_da_args_t *args)
1917{
1918 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
1919 xfs_mount_t *mp;
1920 xfs_daddr_t dblkno;
1921 xfs_caddr_t dst;
1922 xfs_buf_t *bp;
1923 int nmap, error, tmp, valuelen, blkcnt, i;
1924 xfs_dablk_t lblkno;
1925
1926 ASSERT(!(args->flags & ATTR_KERNOVAL));
1927
1928 mp = args->dp->i_mount;
1929 dst = args->value;
1930 valuelen = args->valuelen;
1931 lblkno = args->rmtblkno;
1932 while (valuelen > 0) {
1933 nmap = ATTR_RMTVALUE_MAPSIZE;
1934 error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
1935 args->rmtblkcnt,
1936 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
1937 NULL, 0, map, &nmap, NULL);
1938 if (error)
1939 return(error);
1940 ASSERT(nmap >= 1);
1941
1942 for (i = 0; (i < nmap) && (valuelen > 0); i++) {
1943 ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
1944 (map[i].br_startblock != HOLESTARTBLOCK));
1945 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
1946 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
1947 error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
1948 blkcnt, XFS_BUF_LOCK, &bp);
1949 if (error)
1950 return(error);
1951
1952 tmp = (valuelen < XFS_BUF_SIZE(bp))
1953 ? valuelen : XFS_BUF_SIZE(bp);
1954 xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
1955 xfs_buf_relse(bp);
1956 dst += tmp;
1957 valuelen -= tmp;
1958
1959 lblkno += map[i].br_blockcount;
1960 }
1961 }
1962 ASSERT(valuelen == 0);
1963 return(0);
1964}
1965
1966/*
1967 * Write the value associated with an attribute into the out-of-line buffer
1968 * that we have defined for it.
1969 */
1970STATIC int
1971xfs_attr_rmtval_set(xfs_da_args_t *args)
1972{
1973 xfs_mount_t *mp;
1974 xfs_fileoff_t lfileoff;
1975 xfs_inode_t *dp;
1976 xfs_bmbt_irec_t map;
1977 xfs_daddr_t dblkno;
1978 xfs_caddr_t src;
1979 xfs_buf_t *bp;
1980 xfs_dablk_t lblkno;
1981 int blkcnt, valuelen, nmap, error, tmp, committed;
1982
1983 dp = args->dp;
1984 mp = dp->i_mount;
1985 src = args->value;
1986
1987 /*
1988 * Find a "hole" in the attribute address space large enough for
1989 * us to drop the new attribute's value into.
1990 */
1991 blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
1992 lfileoff = 0;
1993 error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
1994 XFS_ATTR_FORK);
1995 if (error) {
1996 return(error);
1997 }
1998 args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
1999 args->rmtblkcnt = blkcnt;
2000
2001 /*
2002 * Roll through the "value", allocating blocks on disk as required.
2003 */
2004 while (blkcnt > 0) {
2005 /*
2006 * Allocate a single extent, up to the size of the value.
2007 */
2008 XFS_BMAP_INIT(args->flist, args->firstblock);
2009 nmap = 1;
2010 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
2011 blkcnt,
2012 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
2013 XFS_BMAPI_WRITE,
2014 args->firstblock, args->total, &map, &nmap,
2015 args->flist);
2016 if (!error) {
2017 error = xfs_bmap_finish(&args->trans, args->flist,
2018 *args->firstblock, &committed);
2019 }
2020 if (error) {
2021 ASSERT(committed);
2022 args->trans = NULL;
2023 xfs_bmap_cancel(args->flist);
2024 return(error);
2025 }
2026
2027 /*
2028 * bmap_finish() may have committed the last trans and started
2029 * a new one. We need the inode to be in all transactions.
2030 */
2031 if (committed) {
2032 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
2033 xfs_trans_ihold(args->trans, dp);
2034 }
2035
2036 ASSERT(nmap == 1);
2037 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2038 (map.br_startblock != HOLESTARTBLOCK));
2039 lblkno += map.br_blockcount;
2040 blkcnt -= map.br_blockcount;
2041
2042 /*
2043 * Start the next trans in the chain.
2044 */
2045 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
2046 return (error);
2047 }
2048
2049 /*
2050 * Roll through the "value", copying the attribute value to the
2051 * already-allocated blocks. Blocks are written synchronously
2052 * so that we can know they are all on disk before we turn off
2053 * the INCOMPLETE flag.
2054 */
2055 lblkno = args->rmtblkno;
2056 valuelen = args->valuelen;
2057 while (valuelen > 0) {
2058 /*
2059 * Try to remember where we decided to put the value.
2060 */
2061 XFS_BMAP_INIT(args->flist, args->firstblock);
2062 nmap = 1;
2063 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
2064 args->rmtblkcnt,
2065 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2066 args->firstblock, 0, &map, &nmap, NULL);
2067 if (error) {
2068 return(error);
2069 }
2070 ASSERT(nmap == 1);
2071 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2072 (map.br_startblock != HOLESTARTBLOCK));
2073
2074 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
2075 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2076
2077 bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
2078 blkcnt, XFS_BUF_LOCK);
2079 ASSERT(bp);
2080 ASSERT(!XFS_BUF_GETERROR(bp));
2081
2082 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2083 XFS_BUF_SIZE(bp);
2084 xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
2085 if (tmp < XFS_BUF_SIZE(bp))
2086 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2087 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2088 return (error);
2089 }
2090 src += tmp;
2091 valuelen -= tmp;
2092
2093 lblkno += map.br_blockcount;
2094 }
2095 ASSERT(valuelen == 0);
2096 return(0);
2097}
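/*
 * Editorial worked example (assumption, not part of the original
 * source): for an 8192 byte value on a 4096 byte block filesystem,
 * XFS_B_TO_FSB(mp, 8192) == 2, so args->rmtblkcnt == 2.  If bmapi
 * returns one 2-block mapping, the loop above does a single 8192 byte
 * synchronous xfs_bwrite(); if the extent was split into two 1-block
 * mappings, it does two 4096 byte writes.  Either way all blocks are
 * on disk before the caller clears the INCOMPLETE flag.
 */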
2098
2099/*
2100 * Remove the value associated with an attribute by deleting the
2101 * out-of-line buffer that it is stored on.
2102 */
2103STATIC int
2104xfs_attr_rmtval_remove(xfs_da_args_t *args)
2105{
2106 xfs_mount_t *mp;
2107 xfs_bmbt_irec_t map;
2108 xfs_buf_t *bp;
2109 xfs_daddr_t dblkno;
2110 xfs_dablk_t lblkno;
2111 int valuelen, blkcnt, nmap, error, done, committed;
2112
2113 mp = args->dp->i_mount;
2114
2115 /*
2116 * Roll through the "value", invalidating the attribute value's
2117 * blocks.
2118 */
2119 lblkno = args->rmtblkno;
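	/* note (editorial): "valuelen" counts fs blocks here, not bytes */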
2120 valuelen = args->rmtblkcnt;
2121 while (valuelen > 0) {
2122 /*
2123 * Look up where the value was stored so its buffers can be invalidated.
2124 */
2125 XFS_BMAP_INIT(args->flist, args->firstblock);
2126 nmap = 1;
2127 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
2128 args->rmtblkcnt,
2129 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2130 args->firstblock, 0, &map, &nmap,
2131 args->flist);
2132 if (error) {
2133 return(error);
2134 }
2135 ASSERT(nmap == 1);
2136 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2137 (map.br_startblock != HOLESTARTBLOCK));
2138
2139 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
2140 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2141
2142 /*
2143 * If the "remote" value is in the cache, remove it.
2144 */
2145 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
2146 XFS_INCORE_TRYLOCK);
2147 if (bp) {
2148 XFS_BUF_STALE(bp);
2149 XFS_BUF_UNDELAYWRITE(bp);
2150 xfs_buf_relse(bp);
2151 bp = NULL;
2152 }
2153
2154 valuelen -= map.br_blockcount;
2155
2156 lblkno += map.br_blockcount;
2157 }
2158
2159 /*
2160 * Keep de-allocating extents until the remote-value region is gone.
2161 */
2162 lblkno = args->rmtblkno;
2163 blkcnt = args->rmtblkcnt;
2164 done = 0;
2165 while (!done) {
2166 XFS_BMAP_INIT(args->flist, args->firstblock);
2167 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
2168 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2169 1, args->firstblock, args->flist, &done);
2170 if (!error) {
2171 error = xfs_bmap_finish(&args->trans, args->flist,
2172 *args->firstblock, &committed);
2173 }
2174 if (error) {
2175 ASSERT(committed);
2176 args->trans = NULL;
2177 xfs_bmap_cancel(args->flist);
2178 return(error);
2179 }
2180
2181 /*
2182 * bmap_finish() may have committed the last trans and started
2183 * a new one. We need the inode to be in all transactions.
2184 */
2185 if (committed) {
2186 xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
2187 xfs_trans_ihold(args->trans, args->dp);
2188 }
2189
2190 /*
2191 * Close out trans and start the next one in the chain.
2192 */
2193 if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
2194 return (error);
2195 }
2196 return(0);
2197}
2198
2199#if defined(XFS_ATTR_TRACE)
2200/*
2201 * Add a trace buffer entry for an attr_list context structure.
2202 */
2203void
2204xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
2205{
2206 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where,
2207 (__psunsigned_t)context->dp,
2208 (__psunsigned_t)context->cursor->hashval,
2209 (__psunsigned_t)context->cursor->blkno,
2210 (__psunsigned_t)context->cursor->offset,
2211 (__psunsigned_t)context->alist,
2212 (__psunsigned_t)context->bufsize,
2213 (__psunsigned_t)context->count,
2214 (__psunsigned_t)context->firstu,
2215 (__psunsigned_t)
2216 ((context->count > 0) &&
2217 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2218 ? (ATTR_ENTRY(context->alist,
2219 context->count-1)->a_valuelen)
2220 : 0,
2221 (__psunsigned_t)context->dupcnt,
2222 (__psunsigned_t)context->flags,
2223 (__psunsigned_t)NULL,
2224 (__psunsigned_t)NULL,
2225 (__psunsigned_t)NULL);
2226}
2227
2228/*
2229 * Add a trace buffer entry for a context structure and a Btree node.
2230 */
2231void
2232xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
2233 struct xfs_da_intnode *node)
2234{
2235 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where,
2236 (__psunsigned_t)context->dp,
2237 (__psunsigned_t)context->cursor->hashval,
2238 (__psunsigned_t)context->cursor->blkno,
2239 (__psunsigned_t)context->cursor->offset,
2240 (__psunsigned_t)context->alist,
2241 (__psunsigned_t)context->bufsize,
2242 (__psunsigned_t)context->count,
2243 (__psunsigned_t)context->firstu,
2244 (__psunsigned_t)
2245 ((context->count > 0) &&
2246 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2247 ? (ATTR_ENTRY(context->alist,
2248 context->count-1)->a_valuelen)
2249 : 0,
2250 (__psunsigned_t)context->dupcnt,
2251 (__psunsigned_t)context->flags,
2252 (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT),
2253 (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT),
2254 (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
2255}
2256
2257/*
2258 * Add a trace buffer entry for a context structure and a Btree element.
2259 */
2260void
2261xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
2262 struct xfs_da_node_entry *btree)
2263{
2264 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where,
2265 (__psunsigned_t)context->dp,
2266 (__psunsigned_t)context->cursor->hashval,
2267 (__psunsigned_t)context->cursor->blkno,
2268 (__psunsigned_t)context->cursor->offset,
2269 (__psunsigned_t)context->alist,
2270 (__psunsigned_t)context->bufsize,
2271 (__psunsigned_t)context->count,
2272 (__psunsigned_t)context->firstu,
2273 (__psunsigned_t)
2274 ((context->count > 0) &&
2275 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2276 ? (ATTR_ENTRY(context->alist,
2277 context->count-1)->a_valuelen)
2278 : 0,
2279 (__psunsigned_t)context->dupcnt,
2280 (__psunsigned_t)context->flags,
2281 (__psunsigned_t)INT_GET(btree->hashval, ARCH_CONVERT),
2282 (__psunsigned_t)INT_GET(btree->before, ARCH_CONVERT),
2283 (__psunsigned_t)NULL);
2284}
2285
2286/*
2287 * Add a trace buffer entry for a context structure and a leaf block.
2288 */
2289void
2290xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
2291 struct xfs_attr_leafblock *leaf)
2292{
2293 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where,
2294 (__psunsigned_t)context->dp,
2295 (__psunsigned_t)context->cursor->hashval,
2296 (__psunsigned_t)context->cursor->blkno,
2297 (__psunsigned_t)context->cursor->offset,
2298 (__psunsigned_t)context->alist,
2299 (__psunsigned_t)context->bufsize,
2300 (__psunsigned_t)context->count,
2301 (__psunsigned_t)context->firstu,
2302 (__psunsigned_t)
2303 ((context->count > 0) &&
2304 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2305 ? (ATTR_ENTRY(context->alist,
2306 context->count-1)->a_valuelen)
2307 : 0,
2308 (__psunsigned_t)context->dupcnt,
2309 (__psunsigned_t)context->flags,
2310 (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT),
2311 (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
2312 (__psunsigned_t)INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
2313}
2314
2315/*
2316 * Add a trace buffer entry for the arguments given to the routine,
2317 * generic form.
2318 */
2319void
2320xfs_attr_trace_enter(int type, char *where,
2321 __psunsigned_t a2, __psunsigned_t a3,
2322 __psunsigned_t a4, __psunsigned_t a5,
2323 __psunsigned_t a6, __psunsigned_t a7,
2324 __psunsigned_t a8, __psunsigned_t a9,
2325 __psunsigned_t a10, __psunsigned_t a11,
2326 __psunsigned_t a12, __psunsigned_t a13,
2327 __psunsigned_t a14, __psunsigned_t a15)
2328{
2329 ASSERT(xfs_attr_trace_buf);
2330 ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
2331 (void *)where,
2332 (void *)a2, (void *)a3, (void *)a4,
2333 (void *)a5, (void *)a6, (void *)a7,
2334 (void *)a8, (void *)a9, (void *)a10,
2335 (void *)a11, (void *)a12, (void *)a13,
2336 (void *)a14, (void *)a15);
2337}
2338#endif /* XFS_ATTR_TRACE */
2339
2340
2341/*========================================================================
2342 * System (pseudo) namespace attribute interface routines.
2343 *========================================================================*/
2344
2345STATIC int
2346posix_acl_access_set(
2347 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2348{
2349 return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
2350}
2351
2352STATIC int
2353posix_acl_access_remove(
2354 struct vnode *vp, char *name, int xflags)
2355{
2356 return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
2357}
2358
2359STATIC int
2360posix_acl_access_get(
2361 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2362{
2363 return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
2364}
2365
2366STATIC int
2367posix_acl_access_exists(
2368 vnode_t *vp)
2369{
2370 return xfs_acl_vhasacl_access(vp);
2371}
2372
2373STATIC int
2374posix_acl_default_set(
2375 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2376{
2377 return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
2378}
2379
2380STATIC int
2381posix_acl_default_get(
2382 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2383{
2384 return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
2385}
2386
2387STATIC int
2388posix_acl_default_remove(
2389 struct vnode *vp, char *name, int xflags)
2390{
2391 return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
2392}
2393
2394STATIC int
2395posix_acl_default_exists(
2396 vnode_t *vp)
2397{
2398 return xfs_acl_vhasacl_default(vp);
2399}
2400
2401struct attrnames posix_acl_access = {
2402 .attr_name = "posix_acl_access",
2403 .attr_namelen = sizeof("posix_acl_access") - 1,
2404 .attr_get = posix_acl_access_get,
2405 .attr_set = posix_acl_access_set,
2406 .attr_remove = posix_acl_access_remove,
2407 .attr_exists = posix_acl_access_exists,
2408};
2409
2410struct attrnames posix_acl_default = {
2411 .attr_name = "posix_acl_default",
2412 .attr_namelen = sizeof("posix_acl_default") - 1,
2413 .attr_get = posix_acl_default_get,
2414 .attr_set = posix_acl_default_set,
2415 .attr_remove = posix_acl_default_remove,
2416 .attr_exists = posix_acl_default_exists,
2417};
2418
2419struct attrnames *attr_system_names[] =
2420 { &posix_acl_access, &posix_acl_default };
2421
2422
2423/*========================================================================
2424 * Namespace-prefix-style attribute name interface routines.
2425 *========================================================================*/
2426
2427STATIC int
2428attr_generic_set(
2429 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2430{
2431 int error;
2432
2433 VOP_ATTR_SET(vp, name, data, size, xflags, NULL, error);
2434 return -error;
2435}
2436
2437STATIC int
2438attr_generic_get(
2439 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2440{
2441 int error, asize = size;
2442
2443 VOP_ATTR_GET(vp, name, data, &asize, xflags, NULL, error);
2444 if (!error)
2445 return asize;
2446 return -error;
2447}
2448
2449STATIC int
2450attr_generic_remove(
2451 struct vnode *vp, char *name, int xflags)
2452{
2453 int error;
2454
2455 VOP_ATTR_REMOVE(vp, name, xflags, NULL, error);
2456 return -error;
2457}
2458
2459STATIC int
2460attr_generic_listadd(
2461 attrnames_t *prefix,
2462 attrnames_t *namesp,
2463 void *data,
2464 size_t size,
2465 ssize_t *result)
2466{
2467 char *p = data + *result;
2468
2469 *result += prefix->attr_namelen;
2470 *result += namesp->attr_namelen + 1;
2471 if (!size)
2472 return 0;
2473 if (*result > size)
2474 return -ERANGE;
2475 strcpy(p, prefix->attr_name);
2476 p += prefix->attr_namelen;
2477 strcpy(p, namesp->attr_name);
2478 p += namesp->attr_namelen + 1;
2479 return 0;
2480}
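/*
 * Editorial worked example (not part of the original source): for the
 * system namespace, listing posix_acl_access appends the bytes
 * "system.posix_acl_access\0" to the output and advances *result by
 * attr_system.attr_namelen (7) + posix_acl_access.attr_namelen (16)
 * + 1 == 24 bytes.  With size == 0, only the required length is
 * accumulated, letting callers size their buffer first.
 */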
2481
2482STATIC int
2483attr_system_list(
2484 struct vnode *vp,
2485 void *data,
2486 size_t size,
2487 ssize_t *result)
2488{
2489 attrnames_t *namesp;
2490 int i, error = 0;
2491
2492 for (i = 0; i < ATTR_SYSCOUNT; i++) {
2493 namesp = attr_system_names[i];
2494 if (!namesp->attr_exists || !namesp->attr_exists(vp))
2495 continue;
2496 error = attr_generic_listadd(&attr_system, namesp,
2497 data, size, result);
2498 if (error)
2499 break;
2500 }
2501 return error;
2502}
2503
2504int
2505attr_generic_list(
2506 struct vnode *vp, void *data, size_t size, int xflags, ssize_t *result)
2507{
2508 attrlist_cursor_kern_t cursor = { 0 };
2509 int error;
2510
2511 VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
2512 if (error > 0)
2513 return -error;
2514 *result = -error;
2515 return attr_system_list(vp, data, size, result);
2516}
2517
2518attrnames_t *
2519attr_lookup_namespace(
2520 char *name,
2521 struct attrnames **names,
2522 int nnames)
2523{
2524 int i;
2525
2526 for (i = 0; i < nnames; i++)
2527 if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen))
2528 return names[i];
2529 return NULL;
2530}
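/*
 * Editorial example (not part of the original source): with the tables
 * defined below,
 *
 *	attr_lookup_namespace("user.foo", attr_namespaces, ATTR_NAMECOUNT)
 *
 * matches on the "user." prefix and returns &attr_user; a name with no
 * recognized prefix returns NULL.
 */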
2531
2532/*
2533 * Some checks to prevent people abusing EAs to get over quota:
2534 * - Don't allow modifying user EAs on devices/symlinks;
2535 * - Don't allow modifying user EAs if sticky bit set;
2536 */
2537STATIC int
2538attr_user_capable(
2539 struct vnode *vp,
2540 cred_t *cred)
2541{
2542 struct inode *inode = LINVFS_GET_IP(vp);
2543
2544 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
2545 return -EPERM;
2546 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
2547 !capable(CAP_SYS_ADMIN))
2548 return -EPERM;
2549 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
2550 (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER))
2551 return -EPERM;
2552 return 0;
2553}
2554
2555STATIC int
2556attr_trusted_capable(
2557 struct vnode *vp,
2558 cred_t *cred)
2559{
2560 struct inode *inode = LINVFS_GET_IP(vp);
2561
2562 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
2563 return -EPERM;
2564 if (!capable(CAP_SYS_ADMIN))
2565 return -EPERM;
2566 return 0;
2567}
2568
2569STATIC int
2570attr_secure_capable(
2571 struct vnode *vp,
2572 cred_t *cred)
2573{
2574 return -ENOSECURITY;
2575}
2576
2577STATIC int
2578attr_system_set(
2579 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2580{
2581 attrnames_t *namesp;
2582 int error;
2583
2584 if (xflags & ATTR_CREATE)
2585 return -EINVAL;
2586
2587 namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
2588 if (!namesp)
2589 return -EOPNOTSUPP;
2590 error = namesp->attr_set(vp, name, data, size, xflags);
2591 if (!error)
2592 error = vn_revalidate(vp);
2593 return error;
2594}
2595
2596STATIC int
2597attr_system_get(
2598 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2599{
2600 attrnames_t *namesp;
2601
2602 namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
2603 if (!namesp)
2604 return -EOPNOTSUPP;
2605 return namesp->attr_get(vp, name, data, size, xflags);
2606}
2607
2608STATIC int
2609attr_system_remove(
2610 struct vnode *vp, char *name, int xflags)
2611{
2612 attrnames_t *namesp;
2613
2614 namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
2615 if (!namesp)
2616 return -EOPNOTSUPP;
2617 return namesp->attr_remove(vp, name, xflags);
2618}
2619
2620struct attrnames attr_system = {
2621 .attr_name = "system.",
2622 .attr_namelen = sizeof("system.") - 1,
2623 .attr_flag = ATTR_SYSTEM,
2624 .attr_get = attr_system_get,
2625 .attr_set = attr_system_set,
2626 .attr_remove = attr_system_remove,
2627 .attr_capable = (attrcapable_t)fs_noerr,
2628};
2629
2630struct attrnames attr_trusted = {
2631 .attr_name = "trusted.",
2632 .attr_namelen = sizeof("trusted.") - 1,
2633 .attr_flag = ATTR_ROOT,
2634 .attr_get = attr_generic_get,
2635 .attr_set = attr_generic_set,
2636 .attr_remove = attr_generic_remove,
2637 .attr_capable = attr_trusted_capable,
2638};
2639
2640struct attrnames attr_secure = {
2641 .attr_name = "security.",
2642 .attr_namelen = sizeof("security.") - 1,
2643 .attr_flag = ATTR_SECURE,
2644 .attr_get = attr_generic_get,
2645 .attr_set = attr_generic_set,
2646 .attr_remove = attr_generic_remove,
2647 .attr_capable = attr_secure_capable,
2648};
2649
2650struct attrnames attr_user = {
2651 .attr_name = "user.",
2652 .attr_namelen = sizeof("user.") - 1,
2653 .attr_get = attr_generic_get,
2654 .attr_set = attr_generic_set,
2655 .attr_remove = attr_generic_remove,
2656 .attr_capable = attr_user_capable,
2657};
2658
2659struct attrnames *attr_namespaces[] =
2660 { &attr_system, &attr_trusted, &attr_secure, &attr_user };
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
new file mode 100644
index 000000000000..67cd0f5ac1a7
--- /dev/null
+++ b/fs/xfs/xfs_attr.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (c) 2000, 2002-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ATTR_H__
33#define __XFS_ATTR_H__
34
35/*
36 * xfs_attr.h
37 *
38 * Large attribute lists are structured around Btrees where all the data
39 * elements are in the leaf nodes. Attribute names are hashed into an int,
40 * then that int is used as the index into the Btree. Since the hashval
41 * of an attribute name may not be unique, we may have duplicate keys.
42 * The internal links in the Btree are logical block offsets into the file.
43 *
44 * Small attribute lists use a different format and are packed as tightly
45 * as possible so as to fit into the literal area of the inode.
46 */
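/*
 * Editorial note (assumption, not part of the original header): the
 * Btree key for an attribute is derived as
 *
 *	args.hashval = xfs_da_hashname(name, namelen);
 *
 * and since different names can hash to the same value, lookups compare
 * the stored names among equal-hashval entries (the attrlist cursor
 * below keeps an "offset in list of equal-hashvals" for the same
 * reason).
 */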
47
48/*========================================================================
49 * External interfaces
50 *========================================================================*/
51
52struct cred;
53struct vnode;
54
55typedef int (*attrset_t)(struct vnode *, char *, void *, size_t, int);
56typedef int (*attrget_t)(struct vnode *, char *, void *, size_t, int);
57typedef int (*attrremove_t)(struct vnode *, char *, int);
58typedef int (*attrexists_t)(struct vnode *);
59typedef int (*attrcapable_t)(struct vnode *, struct cred *);
60
61typedef struct attrnames {
62 char * attr_name;
63 unsigned int attr_namelen;
64 unsigned int attr_flag;
65 attrget_t attr_get;
66 attrset_t attr_set;
67 attrremove_t attr_remove;
68 attrexists_t attr_exists;
69 attrcapable_t attr_capable;
70} attrnames_t;
71
72#define ATTR_NAMECOUNT 4
73extern struct attrnames attr_user;
74extern struct attrnames attr_secure;
75extern struct attrnames attr_system;
76extern struct attrnames attr_trusted;
77extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
78
79#define ATTR_SYSCOUNT 2
80extern struct attrnames posix_acl_access;
81extern struct attrnames posix_acl_default;
82extern struct attrnames *attr_system_names[ATTR_SYSCOUNT];
83
84extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
85extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
86
87#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */
88#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
89#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */
90#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
91#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
92#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
93#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */
94
95#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */
96#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
97#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
98#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */
99
100#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */
101#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */
102#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS)
103
104/*
105 * The maximum size (into the kernel or returned from the kernel) of an
106 * attribute value or the buffer used for an attr_list() call. Larger
107 * sizes will result in an ERANGE return code.
108 */
109#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
110
111/*
112 * Define how lists of attribute names are returned to the user from
113 * the attr_list() call. A large, 32bit aligned, buffer is passed in
114 * along with its size. We put an array of offsets at the top that each
115 * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
116 */
117typedef struct attrlist {
118 __s32 al_count; /* number of entries in attrlist */
119 __s32 al_more; /* T/F: more attrs (do call again) */
120 __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
121} attrlist_t;
122
123/*
124 * Show the interesting info about one attribute. This is what the
125 * al_offset[i] entry points to.
126 */
127typedef struct attrlist_ent { /* data from attr_list() */
128 __u32 a_valuelen; /* number bytes in value of attr */
129 char a_name[1]; /* attr name (NULL terminated) */
130} attrlist_ent_t;
131
132/*
133 * Given a pointer to the (char*) buffer containing the attr_list() result,
134 * and an index, return a pointer to the indicated attribute in the buffer.
135 */
136#define ATTR_ENTRY(buffer, index) \
137 ((attrlist_ent_t *) \
138 &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
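/*
 * Editorial usage sketch (not part of the original header): walking an
 * attr_list() result buffer with ATTR_ENTRY.
 *
 *	attrlist_t *al = (attrlist_t *)buffer;
 *	attrlist_ent_t *aep;
 *	int i;
 *
 *	for (i = 0; i < al->al_count; i++) {
 *		aep = ATTR_ENTRY(buffer, i);
 *		use(aep->a_name, aep->a_valuelen);
 *	}
 *
 * "buffer" and use() are placeholders; entries are packed from the
 * bottom of the buffer while al_offset[] grows from the top.
 */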
139
140/*
141 * Multi-attribute operation vector.
142 */
143typedef struct attr_multiop {
144 int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */
145 int am_error; /* [out arg] result of this sub-op (an errno) */
146 char *am_attrname; /* attribute name to work with */
147 char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */
148 int am_length; /* [in/out arg] length of value */
149 int am_flags; /* bitwise OR of attr API flags defined above */
150} attr_multiop_t;
151
152#define ATTR_OP_GET 1 /* return the indicated attr's value */
153#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
154#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
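/*
 * Editorial example (assumption; the multi-op entry point that consumes
 * this vector is not in this file): a batch that sets one attribute and
 * removes another could be described as
 *
 *	attr_multiop_t ops[2] = {
 *		{ .am_opcode = ATTR_OP_SET, .am_attrname = "user.a",
 *		  .am_attrvalue = val, .am_length = vallen },
 *		{ .am_opcode = ATTR_OP_REMOVE, .am_attrname = "user.b" },
 *	};
 *
 * with each op's result reported back through its am_error field.
 */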
155
156/*
157 * Kernel-internal version of the attrlist cursor.
158 */
159typedef struct attrlist_cursor_kern {
160 __u32 hashval; /* hash value of next entry to add */
161 __u32 blkno; /* block containing entry (suggestion) */
162 __u32 offset; /* offset in list of equal-hashvals */
163 __u16 pad1; /* padding to match user-level */
164 __u8 pad2; /* padding to match user-level */
165 __u8 initted; /* T/F: cursor has been initialized */
166} attrlist_cursor_kern_t;
167
168
169/*========================================================================
170 * Function prototypes for the kernel.
171 *========================================================================*/
172
173struct xfs_inode;
174struct attrlist_cursor_kern;
175struct xfs_da_args;
176
177/*
178 * Overall external interface routines.
179 */
180int xfs_attr_get(bhv_desc_t *, char *, char *, int *, int, struct cred *);
181int xfs_attr_set(bhv_desc_t *, char *, char *, int, int, struct cred *);
182int xfs_attr_remove(bhv_desc_t *, char *, int, struct cred *);
183int xfs_attr_list(bhv_desc_t *, char *, int, int,
184 struct attrlist_cursor_kern *, struct cred *);
185int xfs_attr_inactive(struct xfs_inode *dp);
186
187int xfs_attr_node_get(struct xfs_da_args *);
188int xfs_attr_leaf_get(struct xfs_da_args *);
189int xfs_attr_shortform_getvalue(struct xfs_da_args *);
190int xfs_attr_fetch(struct xfs_inode *, char *, int,
191 char *, int *, int, struct cred *);
192
193#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
new file mode 100644
index 000000000000..b11256e58bf4
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -0,0 +1,3050 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32/*
33 * xfs_attr_leaf.c
34 *
35 * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_sb.h"
46#include "xfs_ag.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_ialloc_btree.h"
54#include "xfs_alloc.h"
55#include "xfs_btree.h"
56#include "xfs_attr_sf.h"
57#include "xfs_dir_sf.h"
58#include "xfs_dir2_sf.h"
59#include "xfs_dinode.h"
60#include "xfs_inode_item.h"
61#include "xfs_inode.h"
62#include "xfs_bmap.h"
63#include "xfs_da_btree.h"
64#include "xfs_attr.h"
65#include "xfs_attr_leaf.h"
66#include "xfs_error.h"
67#include "xfs_bit.h"
68
69/*
70 * xfs_attr_leaf.c
71 *
72 * Routines to implement leaf blocks of attributes as Btrees of hashed names.
73 */
74
75/*========================================================================
76 * Function prototypes for the kernel.
77 *========================================================================*/
78
79/*
80 * Routines used for growing the Btree.
81 */
82STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
83 int freemap_index);
84STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer);
85STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
86 xfs_da_state_blk_t *blk1,
87 xfs_da_state_blk_t *blk2);
88STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
89 xfs_da_state_blk_t *leaf_blk_1,
90 xfs_da_state_blk_t *leaf_blk_2,
91 int *number_entries_in_blk1,
92 int *number_usedbytes_in_blk1);
93
94/*
95 * Utility routines.
96 */
97STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
98 int src_start,
99 xfs_attr_leafblock_t *dst_leaf,
100 int dst_start, int move_count,
101 xfs_mount_t *mp);
102
103
104/*========================================================================
105 * External routines when attribute fork size < XFS_LITINO(mp).
106 *========================================================================*/
107
108/*
109 * Create the initial contents of a shortform attribute list.
110 */
111int
112xfs_attr_shortform_create(xfs_da_args_t *args)
113{
114 xfs_attr_sf_hdr_t *hdr;
115 xfs_inode_t *dp;
116 xfs_ifork_t *ifp;
117
118 dp = args->dp;
119 ASSERT(dp != NULL);
120 ifp = dp->i_afp;
121 ASSERT(ifp != NULL);
122 ASSERT(ifp->if_bytes == 0);
123 if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
124 ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */
125 dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
126 ifp->if_flags |= XFS_IFINLINE;
127 } else {
128 ASSERT(ifp->if_flags & XFS_IFINLINE);
129 }
130 xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
131 hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
132 hdr->count = 0;
133 INT_SET(hdr->totsize, ARCH_CONVERT, sizeof(*hdr));
134 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
135 return(0);
136}
137
138/*
139 * Add a name/value pair to the shortform attribute list.
140 * Overflow from the inode has already been checked for.
141 */
142int
143xfs_attr_shortform_add(xfs_da_args_t *args)
144{
145 xfs_attr_shortform_t *sf;
146 xfs_attr_sf_entry_t *sfe;
147 int i, offset, size;
148 xfs_inode_t *dp;
149 xfs_ifork_t *ifp;
150
151 dp = args->dp;
152 ifp = dp->i_afp;
153 ASSERT(ifp->if_flags & XFS_IFINLINE);
154 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
155 sfe = &sf->list[0];
156 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
157 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
158 if (sfe->namelen != args->namelen)
159 continue;
160 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
161 continue;
162 if (((args->flags & ATTR_SECURE) != 0) !=
163 ((sfe->flags & XFS_ATTR_SECURE) != 0))
164 continue;
165 if (((args->flags & ATTR_ROOT) != 0) !=
166 ((sfe->flags & XFS_ATTR_ROOT) != 0))
167 continue;
168 return(XFS_ERROR(EEXIST));
169 }
170
171 offset = (char *)sfe - (char *)sf;
172 size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
173 xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
174 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
175 sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
176
177 sfe->namelen = args->namelen;
178 INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen);
179 sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
180 ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
181 memcpy(sfe->nameval, args->name, args->namelen);
182 memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
183 INT_MOD(sf->hdr.count, ARCH_CONVERT, 1);
184 INT_MOD(sf->hdr.totsize, ARCH_CONVERT, size);
185 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
186
187 return(0);
188}
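/*
 * Editorial worked example (not part of the original source): a
 * shortform entry costs sizeof(struct xfs_attr_sf_entry)-1 + namelen +
 * valuelen bytes (the same accounting xfs_attr_shortform_allfit() uses
 * below), so adding name "foo" (namelen 3) with a 10 byte value grows
 * the fork by 3 + 3 + 10 == 16 bytes and bumps both hdr.count and
 * hdr.totsize accordingly.
 */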
189
190/*
191 * Remove a name from the shortform attribute list structure.
192 */
193int
194xfs_attr_shortform_remove(xfs_da_args_t *args)
195{
196 xfs_attr_shortform_t *sf;
197 xfs_attr_sf_entry_t *sfe;
198 int base, size=0, end, totsize, i;
199 xfs_inode_t *dp;
200
201 /*
202 * Remove the attribute.
203 */
204 dp = args->dp;
205 base = sizeof(xfs_attr_sf_hdr_t);
206 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
207 sfe = &sf->list[0];
208 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
209 sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
210 base += size, i++) {
211 size = XFS_ATTR_SF_ENTSIZE(sfe);
212 if (sfe->namelen != args->namelen)
213 continue;
214 if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
215 continue;
216 if (((args->flags & ATTR_SECURE) != 0) !=
217 ((sfe->flags & XFS_ATTR_SECURE) != 0))
218 continue;
219 if (((args->flags & ATTR_ROOT) != 0) !=
220 ((sfe->flags & XFS_ATTR_ROOT) != 0))
221 continue;
222 break;
223 }
224 if (i == INT_GET(sf->hdr.count, ARCH_CONVERT))
225 return(XFS_ERROR(ENOATTR));
226
227 end = base + size;
228 totsize = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
229 if (end != totsize) {
230 memmove(&((char *)sf)[base], &((char *)sf)[end],
231 totsize - end);
232 }
233 INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
234 INT_MOD(sf->hdr.totsize, ARCH_CONVERT, -size);
235 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
236 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
237
238 return(0);
239}
240
241/*
242 * Look up a name in a shortform attribute list structure.
243 */
244/*ARGSUSED*/
245int
246xfs_attr_shortform_lookup(xfs_da_args_t *args)
247{
248 xfs_attr_shortform_t *sf;
249 xfs_attr_sf_entry_t *sfe;
250 int i;
251 xfs_ifork_t *ifp;
252
253 ifp = args->dp->i_afp;
254 ASSERT(ifp->if_flags & XFS_IFINLINE);
255 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
256 sfe = &sf->list[0];
257 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
258 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
259 if (sfe->namelen != args->namelen)
260 continue;
261 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
262 continue;
263 if (((args->flags & ATTR_SECURE) != 0) !=
264 ((sfe->flags & XFS_ATTR_SECURE) != 0))
265 continue;
266 if (((args->flags & ATTR_ROOT) != 0) !=
267 ((sfe->flags & XFS_ATTR_ROOT) != 0))
268 continue;
269 return(XFS_ERROR(EEXIST));
270 }
271 return(XFS_ERROR(ENOATTR));
272}
273
274/*
275 * Look up a name in a shortform attribute list structure and return its value.
276 */
277/*ARGSUSED*/
278int
279xfs_attr_shortform_getvalue(xfs_da_args_t *args)
280{
281 xfs_attr_shortform_t *sf;
282 xfs_attr_sf_entry_t *sfe;
283 int i;
284
285 ASSERT(args->dp->i_afp->if_flags & XFS_IFINLINE);
286 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
287 sfe = &sf->list[0];
288 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
289 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
290 if (sfe->namelen != args->namelen)
291 continue;
292 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
293 continue;
294 if (((args->flags & ATTR_SECURE) != 0) !=
295 ((sfe->flags & XFS_ATTR_SECURE) != 0))
296 continue;
297 if (((args->flags & ATTR_ROOT) != 0) !=
298 ((sfe->flags & XFS_ATTR_ROOT) != 0))
299 continue;
300 if (args->flags & ATTR_KERNOVAL) {
301 args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
302 return(XFS_ERROR(EEXIST));
303 }
304 if (args->valuelen < INT_GET(sfe->valuelen, ARCH_CONVERT)) {
305 args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
306 return(XFS_ERROR(ERANGE));
307 }
308 args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
309 memcpy(args->value, &sfe->nameval[args->namelen],
310 args->valuelen);
311 return(XFS_ERROR(EEXIST));
312 }
313 return(XFS_ERROR(ENOATTR));
314}
315
316/*
317 * Convert from using the shortform to the leaf.
318 */
319int
320xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
321{
322 xfs_inode_t *dp;
323 xfs_attr_shortform_t *sf;
324 xfs_attr_sf_entry_t *sfe;
325 xfs_da_args_t nargs;
326 char *tmpbuffer;
327 int error, i, size;
328 xfs_dablk_t blkno;
329 xfs_dabuf_t *bp;
330 xfs_ifork_t *ifp;
331
332 dp = args->dp;
333 ifp = dp->i_afp;
334 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
335 size = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
336 tmpbuffer = kmem_alloc(size, KM_SLEEP);
337 ASSERT(tmpbuffer != NULL);
338 memcpy(tmpbuffer, ifp->if_u1.if_data, size);
339 sf = (xfs_attr_shortform_t *)tmpbuffer;
340
341 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
342 bp = NULL;
343 error = xfs_da_grow_inode(args, &blkno);
344 if (error) {
345 /*
346 * If we hit an IO error in the middle of the transaction inside
347 * grow_inode(), we may have inconsistent data. Bail out.
348 */
349 if (error == EIO)
350 goto out;
351 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
352 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
353 goto out;
354 }
355
356 ASSERT(blkno == 0);
357 error = xfs_attr_leaf_create(args, blkno, &bp);
358 if (error) {
359 error = xfs_da_shrink_inode(args, 0, bp);
360 bp = NULL;
361 if (error)
362 goto out;
363 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
364 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
365 goto out;
366 }
367
368 memset((char *)&nargs, 0, sizeof(nargs));
369 nargs.dp = dp;
370 nargs.firstblock = args->firstblock;
371 nargs.flist = args->flist;
372 nargs.total = args->total;
373 nargs.whichfork = XFS_ATTR_FORK;
374 nargs.trans = args->trans;
375 nargs.oknoent = 1;
376
377 sfe = &sf->list[0];
378 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
379 nargs.name = (char *)sfe->nameval;
380 nargs.namelen = sfe->namelen;
381 nargs.value = (char *)&sfe->nameval[nargs.namelen];
382 nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
383 nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
384 sfe->namelen);
385 nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
386 ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
387 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
388 ASSERT(error == ENOATTR);
389 error = xfs_attr_leaf_add(bp, &nargs);
390 ASSERT(error != ENOSPC);
391 if (error)
392 goto out;
393 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
394 }
395 error = 0;
396
397out:
398 if(bp)
399 xfs_da_buf_done(bp);
400 kmem_free(tmpbuffer, size);
401 return(error);
402}
403
404STATIC int
405xfs_attr_shortform_compare(const void *a, const void *b)
406{
407 xfs_attr_sf_sort_t *sa, *sb;
408
409 sa = (xfs_attr_sf_sort_t *)a;
410 sb = (xfs_attr_sf_sort_t *)b;
411 if (INT_GET(sa->hash, ARCH_CONVERT)
412 < INT_GET(sb->hash, ARCH_CONVERT)) {
413 return(-1);
414 } else if (INT_GET(sa->hash, ARCH_CONVERT)
415 > INT_GET(sb->hash, ARCH_CONVERT)) {
416 return(1);
417 } else {
418 return(sa->entno - sb->entno);
419 }
420}
421
422/*
423 * Copy out entries of shortform attribute lists for attr_list().
424 * Shortform attribute lists are not stored in hashval sorted order.
425 * If the output buffer is not large enough to hold them all, then
426 * we have to calculate each entry's hashval and sort them before
427 * we can begin returning them to the user.
428 */
429/*ARGSUSED*/
430int
431xfs_attr_shortform_list(xfs_attr_list_context_t *context)
432{
433 attrlist_cursor_kern_t *cursor;
434 xfs_attr_sf_sort_t *sbuf, *sbp;
435 xfs_attr_shortform_t *sf;
436 xfs_attr_sf_entry_t *sfe;
437 xfs_inode_t *dp;
438 int sbsize, nsbuf, count, i;
439
440 ASSERT(context != NULL);
441 dp = context->dp;
442 ASSERT(dp != NULL);
443 ASSERT(dp->i_afp != NULL);
444 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
445 ASSERT(sf != NULL);
446 if (!sf->hdr.count)
447 return(0);
448 cursor = context->cursor;
449 ASSERT(cursor != NULL);
450
451 xfs_attr_trace_l_c("sf start", context);
452
453 /*
454 * If the buffer is large enough, do not bother with sorting.
455 * Note the generous fudge factor of 16 overhead bytes per entry.
456 */
457 if ((dp->i_afp->if_bytes + INT_GET(sf->hdr.count, ARCH_CONVERT) * 16)
458 < context->bufsize) {
459 for (i = 0, sfe = &sf->list[0];
460 i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
461 attrnames_t *namesp;
462
463 if (((context->flags & ATTR_SECURE) != 0) !=
464 ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
465 !(context->flags & ATTR_KERNORMALS)) {
466 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
467 continue;
468 }
469 if (((context->flags & ATTR_ROOT) != 0) !=
470 ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
471 !(context->flags & ATTR_KERNROOTLS)) {
472 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
473 continue;
474 }
475 namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure:
476 ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted :
477 &attr_user);
478 if (context->flags & ATTR_KERNOVAL) {
479 ASSERT(context->flags & ATTR_KERNAMELS);
480 context->count += namesp->attr_namelen +
481 INT_GET(sfe->namelen, ARCH_CONVERT) + 1;
482 }
483 else {
484 if (xfs_attr_put_listent(context, namesp,
485 (char *)sfe->nameval,
486 (int)sfe->namelen,
487 (int)INT_GET(sfe->valuelen,
488 ARCH_CONVERT)))
489 break;
490 }
491 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
492 }
493 xfs_attr_trace_l_c("sf big-gulp", context);
494 return(0);
495 }
496
497 /*
498 * It didn't all fit, so we have to sort everything on hashval.
499 */
500 sbsize = INT_GET(sf->hdr.count, ARCH_CONVERT) * sizeof(*sbuf);
501 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
502
503 /*
504 * Scan the attribute list for the rest of the entries, storing
505 * the relevant info from only those that match into a buffer.
506 */
507 nsbuf = 0;
508 for (i = 0, sfe = &sf->list[0];
509 i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
510 if (unlikely(
511 ((char *)sfe < (char *)sf) ||
512 ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
513 XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
514 XFS_ERRLEVEL_LOW,
515 context->dp->i_mount, sfe);
516 xfs_attr_trace_l_c("sf corrupted", context);
517 kmem_free(sbuf, sbsize);
518 return XFS_ERROR(EFSCORRUPTED);
519 }
520 if (((context->flags & ATTR_SECURE) != 0) !=
521 ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
522 !(context->flags & ATTR_KERNORMALS)) {
523 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
524 continue;
525 }
526 if (((context->flags & ATTR_ROOT) != 0) !=
527 ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
528 !(context->flags & ATTR_KERNROOTLS)) {
529 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
530 continue;
531 }
532 sbp->entno = i;
533 INT_SET(sbp->hash, ARCH_CONVERT,
534 xfs_da_hashname((char *)sfe->nameval, sfe->namelen));
535 sbp->name = (char *)sfe->nameval;
536 sbp->namelen = sfe->namelen;
537 /* These are bytes, and both on-disk, don't endian-flip */
538 sbp->valuelen = sfe->valuelen;
539 sbp->flags = sfe->flags;
540 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
541 sbp++;
542 nsbuf++;
543 }
544
545 /*
546 * Sort the entries on hash then entno.
547 */
548 qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
549
550 /*
551 * Re-find our place IN THE SORTED LIST.
552 */
553 count = 0;
554 cursor->initted = 1;
555 cursor->blkno = 0;
556 for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
557 if (INT_GET(sbp->hash, ARCH_CONVERT) == cursor->hashval) {
558 if (cursor->offset == count) {
559 break;
560 }
561 count++;
562 } else if (INT_GET(sbp->hash, ARCH_CONVERT) > cursor->hashval) {
563 break;
564 }
565 }
566 if (i == nsbuf) {
567 kmem_free(sbuf, sbsize);
568 xfs_attr_trace_l_c("blk end", context);
569 return(0);
570 }
571
572 /*
573 * Loop putting entries into the user buffer.
574 */
575 for ( ; i < nsbuf; i++, sbp++) {
576 attrnames_t *namesp;
577
578 namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure :
579 ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted :
580 &attr_user);
581
582 if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) {
583 cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT);
584 cursor->offset = 0;
585 }
586 if (context->flags & ATTR_KERNOVAL) {
587 ASSERT(context->flags & ATTR_KERNAMELS);
588 context->count += namesp->attr_namelen +
589 sbp->namelen + 1;
590 } else {
591 if (xfs_attr_put_listent(context, namesp,
592 sbp->name, sbp->namelen,
593 INT_GET(sbp->valuelen, ARCH_CONVERT)))
594 break;
595 }
596 cursor->offset++;
597 }
598
599 kmem_free(sbuf, sbsize);
600 xfs_attr_trace_l_c("sf E-O-F", context);
601 return(0);
602}
603
604/*
605 * Check a leaf attribute block to see if all the entries would fit into
606 * a shortform attribute list.
607 */
608int
609xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
610{
611 xfs_attr_leafblock_t *leaf;
612 xfs_attr_leaf_entry_t *entry;
613 xfs_attr_leaf_name_local_t *name_loc;
614 int bytes, i;
615
616 leaf = bp->data;
617 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
618 == XFS_ATTR_LEAF_MAGIC);
619
620 entry = &leaf->entries[0];
621 bytes = sizeof(struct xfs_attr_sf_hdr);
622 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
623 if (entry->flags & XFS_ATTR_INCOMPLETE)
624 continue; /* don't copy partial entries */
625 if (!(entry->flags & XFS_ATTR_LOCAL))
626 return(0);
627 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
628 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
629 return(0);
630 if (INT_GET(name_loc->valuelen, ARCH_CONVERT) >= XFS_ATTR_SF_ENTSIZE_MAX)
631 return(0);
632 bytes += sizeof(struct xfs_attr_sf_entry)-1
633 + name_loc->namelen
634 + INT_GET(name_loc->valuelen, ARCH_CONVERT);
635 }
636 return( bytes < XFS_IFORK_ASIZE(dp) );
637}
638
639/*
640 * Convert a leaf attribute list to a shortform attribute list.
641 */
642int
643xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args)
644{
645 xfs_attr_leafblock_t *leaf;
646 xfs_attr_leaf_entry_t *entry;
647 xfs_attr_leaf_name_local_t *name_loc;
648 xfs_da_args_t nargs;
649 xfs_inode_t *dp;
650 char *tmpbuffer;
651 int error, i;
652
653 dp = args->dp;
654 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
655 ASSERT(tmpbuffer != NULL);
656
657 ASSERT(bp != NULL);
658 memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
659 leaf = (xfs_attr_leafblock_t *)tmpbuffer;
660 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
661 == XFS_ATTR_LEAF_MAGIC);
662 memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
663
664 /*
665 * Clean out the prior contents of the attribute list.
666 */
667 error = xfs_da_shrink_inode(args, 0, bp);
668 if (error)
669 goto out;
670 error = xfs_attr_shortform_create(args);
671 if (error)
672 goto out;
673
674 /*
675 * Copy the attributes
676 */
677 memset((char *)&nargs, 0, sizeof(nargs));
678 nargs.dp = dp;
679 nargs.firstblock = args->firstblock;
680 nargs.flist = args->flist;
681 nargs.total = args->total;
682 nargs.whichfork = XFS_ATTR_FORK;
683 nargs.trans = args->trans;
684 nargs.oknoent = 1;
685 entry = &leaf->entries[0];
686 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
687 if (entry->flags & XFS_ATTR_INCOMPLETE)
688 continue; /* don't copy partial entries */
689 if (!entry->nameidx)
690 continue;
691 ASSERT(entry->flags & XFS_ATTR_LOCAL);
692 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
693 nargs.name = (char *)name_loc->nameval;
694 nargs.namelen = name_loc->namelen;
695 nargs.value = (char *)&name_loc->nameval[nargs.namelen];
696 nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
697 nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
698 nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
699 ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
700 xfs_attr_shortform_add(&nargs);
701 }
702 error = 0;
703
704out:
705 kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
706 return(error);
707}
708
709/*
710 * Convert from using a single leaf to a root node and a leaf.
711 */
712int
713xfs_attr_leaf_to_node(xfs_da_args_t *args)
714{
715 xfs_attr_leafblock_t *leaf;
716 xfs_da_intnode_t *node;
717 xfs_inode_t *dp;
718 xfs_dabuf_t *bp1, *bp2;
719 xfs_dablk_t blkno;
720 int error;
721
722 dp = args->dp;
723 bp1 = bp2 = NULL;
724 error = xfs_da_grow_inode(args, &blkno);
725 if (error)
726 goto out;
727 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
728 XFS_ATTR_FORK);
729 if (error)
730 goto out;
731 ASSERT(bp1 != NULL);
732 bp2 = NULL;
733 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
734 XFS_ATTR_FORK);
735 if (error)
736 goto out;
737 ASSERT(bp2 != NULL);
738 memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
739 xfs_da_buf_done(bp1);
740 bp1 = NULL;
741 xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
742
743 /*
744 * Set up the new root node.
745 */
746 error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
747 if (error)
748 goto out;
749 node = bp1->data;
750 leaf = bp2->data;
751 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
752 == XFS_ATTR_LEAF_MAGIC);
753 /* both on-disk, don't endian-flip twice */
754 node->btree[0].hashval =
755 leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval;
756 INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
757 INT_SET(node->hdr.count, ARCH_CONVERT, 1);
758 xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
759 error = 0;
760out:
761 if (bp1)
762 xfs_da_buf_done(bp1);
763 if (bp2)
764 xfs_da_buf_done(bp2);
765 return(error);
766}
767
768
769/*========================================================================
770 * Routines used for growing the Btree.
771 *========================================================================*/
772
773/*
774 * Create the initial contents of a leaf attribute list
775 * or a leaf in a node attribute list.
776 */
777int
778xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
779{
780 xfs_attr_leafblock_t *leaf;
781 xfs_attr_leaf_hdr_t *hdr;
782 xfs_inode_t *dp;
783 xfs_dabuf_t *bp;
784 int error;
785
786 dp = args->dp;
787 ASSERT(dp != NULL);
788 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
789 XFS_ATTR_FORK);
790 if (error)
791 return(error);
792 ASSERT(bp != NULL);
793 leaf = bp->data;
794 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
795 hdr = &leaf->hdr;
796 INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_ATTR_LEAF_MAGIC);
797 INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
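	/*
	 * firstused is a 16-bit on-disk field, so a 64KB block size
	 * truncates to zero in the INT_SET above; the check below then
	 * backs it off to a representable value.
	 */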
798 if (!hdr->firstused) {
799 INT_SET(hdr->firstused, ARCH_CONVERT,
800 XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
801 }
802
803 INT_SET(hdr->freemap[0].base, ARCH_CONVERT,
804 sizeof(xfs_attr_leaf_hdr_t));
805 INT_SET(hdr->freemap[0].size, ARCH_CONVERT,
806 INT_GET(hdr->firstused, ARCH_CONVERT)
807 - INT_GET(hdr->freemap[0].base,
808 ARCH_CONVERT));
809
810 xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
811
812 *bpp = bp;
813 return(0);
814}
815
816/*
817 * Split the leaf node, rebalance, then add the new entry.
818 */
819int
820xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
821 xfs_da_state_blk_t *newblk)
822{
823 xfs_dablk_t blkno;
824 int error;
825
826 /*
827 * Allocate space for a new leaf node.
828 */
829 ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
830 error = xfs_da_grow_inode(state->args, &blkno);
831 if (error)
832 return(error);
833 error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
834 if (error)
835 return(error);
836 newblk->blkno = blkno;
837 newblk->magic = XFS_ATTR_LEAF_MAGIC;
838
839 /*
840 * Rebalance the entries across the two leaves.
841 * NOTE: rebalance() currently depends on the 2nd block being empty.
842 */
843 xfs_attr_leaf_rebalance(state, oldblk, newblk);
844 error = xfs_da_blk_link(state, oldblk, newblk);
845 if (error)
846 return(error);
847
848 /*
849	 * Save info on the "old" attribute for "atomic rename" ops; leaf_add()
850	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
851	 * "new" attr's info. We will need the "old" info to remove it later.
852 *
853 * Insert the "new" entry in the correct block.
854 */
855 if (state->inleaf)
856 error = xfs_attr_leaf_add(oldblk->bp, state->args);
857 else
858 error = xfs_attr_leaf_add(newblk->bp, state->args);
859
860 /*
861 * Update last hashval in each block since we added the name.
862 */
863 oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
864 newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
865 return(error);
866}
867
868/*
869 * Add a name to the leaf attribute list structure.
870 */
871int
872xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
873{
874 xfs_attr_leafblock_t *leaf;
875 xfs_attr_leaf_hdr_t *hdr;
876 xfs_attr_leaf_map_t *map;
877 int tablesize, entsize, sum, tmp, i;
878
879 leaf = bp->data;
880 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
881 == XFS_ATTR_LEAF_MAGIC);
882 ASSERT((args->index >= 0)
883 && (args->index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
884 hdr = &leaf->hdr;
885 entsize = xfs_attr_leaf_newentsize(args,
886 args->trans->t_mountp->m_sb.sb_blocksize, NULL);
887
888 /*
889 * Search through freemap for first-fit on new name length.
890 * (may need to figure in size of entry struct too)
891 */
892 tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1)
893 * sizeof(xfs_attr_leaf_entry_t)
894 + sizeof(xfs_attr_leaf_hdr_t);
895 map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
896 for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
897 if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
898 sum += INT_GET(map->size, ARCH_CONVERT);
899 continue;
900 }
901 if (!map->size)
902 continue; /* no space in this map */
903 tmp = entsize;
904 if (INT_GET(map->base, ARCH_CONVERT)
905 < INT_GET(hdr->firstused, ARCH_CONVERT))
906 tmp += sizeof(xfs_attr_leaf_entry_t);
907 if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
908 tmp = xfs_attr_leaf_add_work(bp, args, i);
909 return(tmp);
910 }
911 sum += INT_GET(map->size, ARCH_CONVERT);
912 }
913
914 /*
915 * If there are no holes in the address space of the block,
916 * and we don't have enough freespace, then compaction will do us
917 * no good and we should just give up.
918 */
919 if (!hdr->holes && (sum < entsize))
920 return(XFS_ERROR(ENOSPC));
921
922 /*
923 * Compact the entries to coalesce free space.
924 * This may change the hdr->count via dropping INCOMPLETE entries.
925 */
926 xfs_attr_leaf_compact(args->trans, bp);
927
928 /*
929 * After compaction, the block is guaranteed to have only one
930 * free region, in freemap[0]. If it is not big enough, give up.
931 */
932 if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT)
933 < (entsize + sizeof(xfs_attr_leaf_entry_t)))
934 return(XFS_ERROR(ENOSPC));
935
936 return(xfs_attr_leaf_add_work(bp, args, 0));
937}
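
/*
 * Illustrative sketch (not part of the original source): the freemap
 * scan in xfs_attr_leaf_add() is a first-fit search over the (at most
 * XFS_ATTR_LEAF_MAPSIZE) free regions in the leaf header, walking from
 * the highest-numbered map down.  A region fits when it can hold the
 * name/value bytes plus, if the region borders the entry table (base
 * below firstused), one more entry slot.  Simplified stand-alone
 * version, plain ints in place of the on-disk fields and without the
 * overflowing-table special case:
 */
struct attr_sketch_map {
	int	base;			/* offset of free region in block */
	int	size;			/* bytes in free region */
};

static int				/* map index, or -1 for no fit */
attr_sketch_firstfit(const struct attr_sketch_map *map, int nmaps,
		     int firstused, int entsize, int slotsize)
{
	int need, i;

	for (i = nmaps - 1; i >= 0; i--) {
		if (!map[i].size)
			continue;	/* no space in this map */
		need = entsize;
		if (map[i].base < firstused)
			need += slotsize;	/* entry slot comes out too */
		if (map[i].size >= need)
			return i;
	}
	return -1;			/* caller compacts or gets ENOSPC */
}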
938
939/*
940 * Add a name to a leaf attribute list structure.
941 */
942STATIC int
943xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
944{
945 xfs_attr_leafblock_t *leaf;
946 xfs_attr_leaf_hdr_t *hdr;
947 xfs_attr_leaf_entry_t *entry;
948 xfs_attr_leaf_name_local_t *name_loc;
949 xfs_attr_leaf_name_remote_t *name_rmt;
950 xfs_attr_leaf_map_t *map;
951 xfs_mount_t *mp;
952 int tmp, i;
953
954 leaf = bp->data;
955 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
956 == XFS_ATTR_LEAF_MAGIC);
957 hdr = &leaf->hdr;
958 ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
959 ASSERT((args->index >= 0)
960 && (args->index <= INT_GET(hdr->count, ARCH_CONVERT)));
961
962 /*
963 * Force open some space in the entry array and fill it in.
964 */
965 entry = &leaf->entries[args->index];
966 if (args->index < INT_GET(hdr->count, ARCH_CONVERT)) {
967 tmp = INT_GET(hdr->count, ARCH_CONVERT) - args->index;
968 tmp *= sizeof(xfs_attr_leaf_entry_t);
969 memmove((char *)(entry+1), (char *)entry, tmp);
970 xfs_da_log_buf(args->trans, bp,
971 XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
972 }
973 INT_MOD(hdr->count, ARCH_CONVERT, 1);
974
975 /*
976 * Allocate space for the new string (at the end of the run).
977 */
978 map = &hdr->freemap[mapindex];
979 mp = args->trans->t_mountp;
980 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
981 ASSERT((INT_GET(map->base, ARCH_CONVERT) & 0x3) == 0);
982 ASSERT(INT_GET(map->size, ARCH_CONVERT)
983 >= xfs_attr_leaf_newentsize(args,
984 mp->m_sb.sb_blocksize, NULL));
985 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
986 ASSERT((INT_GET(map->size, ARCH_CONVERT) & 0x3) == 0);
987 INT_MOD(map->size, ARCH_CONVERT,
988 -xfs_attr_leaf_newentsize(args, mp->m_sb.sb_blocksize, &tmp));
989 INT_SET(entry->nameidx, ARCH_CONVERT,
990 INT_GET(map->base, ARCH_CONVERT)
991 + INT_GET(map->size, ARCH_CONVERT));
992 INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
993 entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
994 entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
995 ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
996 if (args->rename) {
997 entry->flags |= XFS_ATTR_INCOMPLETE;
998 if ((args->blkno2 == args->blkno) &&
999 (args->index2 <= args->index)) {
1000 args->index2++;
1001 }
1002 }
1003 xfs_da_log_buf(args->trans, bp,
1004 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
1005 ASSERT((args->index == 0) || (INT_GET(entry->hashval, ARCH_CONVERT)
1006 >= INT_GET((entry-1)->hashval,
1007 ARCH_CONVERT)));
1008 ASSERT((args->index == INT_GET(hdr->count, ARCH_CONVERT)-1) ||
1009 (INT_GET(entry->hashval, ARCH_CONVERT)
1010 <= (INT_GET((entry+1)->hashval, ARCH_CONVERT))));
1011
1012 /*
1013 * Copy the attribute name and value into the new space.
1014 *
1015 * For "remote" attribute values, simply note that we need to
1016 * allocate space for the "remote" value. We can't actually
1017 * allocate the extents in this transaction, and we can't decide
1018 * which blocks they should be as we might allocate more blocks
1019 * as part of this transaction (a split operation for example).
1020 */
1021 if (entry->flags & XFS_ATTR_LOCAL) {
1022 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
1023 name_loc->namelen = args->namelen;
1024 INT_SET(name_loc->valuelen, ARCH_CONVERT, args->valuelen);
1025 memcpy((char *)name_loc->nameval, args->name, args->namelen);
1026 memcpy((char *)&name_loc->nameval[args->namelen], args->value,
1027 INT_GET(name_loc->valuelen, ARCH_CONVERT));
1028 } else {
1029 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
1030 name_rmt->namelen = args->namelen;
1031 memcpy((char *)name_rmt->name, args->name, args->namelen);
1032 entry->flags |= XFS_ATTR_INCOMPLETE;
1033 /* just in case */
1034 name_rmt->valuelen = 0;
1035 name_rmt->valueblk = 0;
1036 args->rmtblkno = 1;
1037 args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
1038 }
1039 xfs_da_log_buf(args->trans, bp,
1040 XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
1041 xfs_attr_leaf_entsize(leaf, args->index)));
1042
1043 /*
1044 * Update the control info for this leaf node
1045 */
1046 if (INT_GET(entry->nameidx, ARCH_CONVERT)
1047 < INT_GET(hdr->firstused, ARCH_CONVERT)) {
1048 /* both on-disk, don't endian-flip twice */
1049 hdr->firstused = entry->nameidx;
1050 }
1051 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
1052 >= ((INT_GET(hdr->count, ARCH_CONVERT)
1053 * sizeof(*entry))+sizeof(*hdr)));
1054 tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1)
1055 * sizeof(xfs_attr_leaf_entry_t)
1056 + sizeof(xfs_attr_leaf_hdr_t);
1057 map = &hdr->freemap[0];
1058 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
1059 if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
1060 INT_MOD(map->base, ARCH_CONVERT,
1061 sizeof(xfs_attr_leaf_entry_t));
1062 INT_MOD(map->size, ARCH_CONVERT,
1063 -sizeof(xfs_attr_leaf_entry_t));
1064 }
1065 }
1066 INT_MOD(hdr->usedbytes, ARCH_CONVERT,
1067 xfs_attr_leaf_entsize(leaf, args->index));
1068 xfs_da_log_buf(args->trans, bp,
1069 XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1070 return(0);
1071}
1072
1073/*
1074 * Garbage collect a leaf attribute list block by copying it to a new buffer.
1075 */
1076STATIC void
1077xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
1078{
1079 xfs_attr_leafblock_t *leaf_s, *leaf_d;
1080 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
1081 xfs_mount_t *mp;
1082 char *tmpbuffer;
1083
1084 mp = trans->t_mountp;
1085 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
1086 ASSERT(tmpbuffer != NULL);
1087 memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp));
1088 memset(bp->data, 0, XFS_LBSIZE(mp));
1089
1090 /*
1091 * Copy basic information
1092 */
1093 leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
1094 leaf_d = bp->data;
1095 hdr_s = &leaf_s->hdr;
1096 hdr_d = &leaf_d->hdr;
1097 hdr_d->info = hdr_s->info; /* struct copy */
1098 INT_SET(hdr_d->firstused, ARCH_CONVERT, XFS_LBSIZE(mp));
1099 /* handle truncation gracefully */
1100 if (!hdr_d->firstused) {
1101 INT_SET(hdr_d->firstused, ARCH_CONVERT,
1102 XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
1103 }
1104 hdr_d->usedbytes = 0;
1105 hdr_d->count = 0;
1106 hdr_d->holes = 0;
1107 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
1108 sizeof(xfs_attr_leaf_hdr_t));
1109 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
1110 INT_GET(hdr_d->firstused, ARCH_CONVERT)
1111 - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
1112
1113 /*
1114	 * Copy all entries in the same (sorted) order,
1115 * but allocate name/value pairs packed and in sequence.
1116 */
1117 xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
1118 (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
1119
1120 xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
1121
1122 kmem_free(tmpbuffer, XFS_LBSIZE(mp));
1123}
1124
1125/*
1126 * Redistribute the attribute list entries between two leaf nodes,
1127 * taking into account the size of the new entry.
1128 *
1129 * NOTE: if new block is empty, then it will get the upper half of the
1130 * old block. At present, all (one) callers pass in an empty second block.
1131 *
1132 * This code adjusts the args->index/blkno and args->index2/blkno2 fields
1133 * to match what it is doing in splitting the attribute leaf block. Those
1134 * values are used in "atomic rename" operations on attributes. Note that
1135 * the "new" and "old" values can end up in different blocks.
1136 */
1137STATIC void
1138xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1139 xfs_da_state_blk_t *blk2)
1140{
1141 xfs_da_args_t *args;
1142 xfs_da_state_blk_t *tmp_blk;
1143 xfs_attr_leafblock_t *leaf1, *leaf2;
1144 xfs_attr_leaf_hdr_t *hdr1, *hdr2;
1145 int count, totallen, max, space, swap;
1146
1147 /*
1148 * Set up environment.
1149 */
1150 ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
1151 ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
1152 leaf1 = blk1->bp->data;
1153 leaf2 = blk2->bp->data;
1154 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
1155 == XFS_ATTR_LEAF_MAGIC);
1156 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
1157 == XFS_ATTR_LEAF_MAGIC);
1158 args = state->args;
1159
1160 /*
1161 * Check ordering of blocks, reverse if it makes things simpler.
1162 *
1163 * NOTE: Given that all (current) callers pass in an empty
1164 * second block, this code should never set "swap".
1165 */
1166 swap = 0;
1167 if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
1168 tmp_blk = blk1;
1169 blk1 = blk2;
1170 blk2 = tmp_blk;
1171 leaf1 = blk1->bp->data;
1172 leaf2 = blk2->bp->data;
1173 swap = 1;
1174 }
1175 hdr1 = &leaf1->hdr;
1176 hdr2 = &leaf2->hdr;
1177
1178 /*
1179 * Examine entries until we reduce the absolute difference in
1180 * byte usage between the two blocks to a minimum. Then get
1181 * the direction to copy and the number of elements to move.
1182 *
1183 * "inleaf" is true if the new entry should be inserted into blk1.
1184 * If "swap" is also true, then reverse the sense of "inleaf".
1185 */
1186 state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
1187 &count, &totallen);
1188 if (swap)
1189 state->inleaf = !state->inleaf;
1190
1191 /*
1192 * Move any entries required from leaf to leaf:
1193 */
1194 if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
1195 /*
1196 * Figure the total bytes to be added to the destination leaf.
1197 */
1198		/* number of entries being moved */
1199 count = INT_GET(hdr1->count, ARCH_CONVERT) - count;
1200 space = INT_GET(hdr1->usedbytes, ARCH_CONVERT) - totallen;
1201 space += count * sizeof(xfs_attr_leaf_entry_t);
1202
1203 /*
1204 * leaf2 is the destination, compact it if it looks tight.
1205 */
1206 max = INT_GET(hdr2->firstused, ARCH_CONVERT)
1207 - sizeof(xfs_attr_leaf_hdr_t);
1208 max -= INT_GET(hdr2->count, ARCH_CONVERT)
1209 * sizeof(xfs_attr_leaf_entry_t);
1210 if (space > max) {
1211 xfs_attr_leaf_compact(args->trans, blk2->bp);
1212 }
1213
1214 /*
1215 * Move high entries from leaf1 to low end of leaf2.
1216 */
1217 xfs_attr_leaf_moveents(leaf1,
1218 INT_GET(hdr1->count, ARCH_CONVERT)-count,
1219 leaf2, 0, count, state->mp);
1220
1221 xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
1222 xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
1223 } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
1224 /*
1225 * I assert that since all callers pass in an empty
1226 * second buffer, this code should never execute.
1227 */
1228
1229 /*
1230 * Figure the total bytes to be added to the destination leaf.
1231 */
1232		/* number of entries being moved */
1233 count -= INT_GET(hdr1->count, ARCH_CONVERT);
1234 space = totallen - INT_GET(hdr1->usedbytes, ARCH_CONVERT);
1235 space += count * sizeof(xfs_attr_leaf_entry_t);
1236
1237 /*
1238 * leaf1 is the destination, compact it if it looks tight.
1239 */
1240 max = INT_GET(hdr1->firstused, ARCH_CONVERT)
1241 - sizeof(xfs_attr_leaf_hdr_t);
1242 max -= INT_GET(hdr1->count, ARCH_CONVERT)
1243 * sizeof(xfs_attr_leaf_entry_t);
1244 if (space > max) {
1245 xfs_attr_leaf_compact(args->trans, blk1->bp);
1246 }
1247
1248 /*
1249 * Move low entries from leaf2 to high end of leaf1.
1250 */
1251 xfs_attr_leaf_moveents(leaf2, 0, leaf1,
1252 (int)INT_GET(hdr1->count, ARCH_CONVERT), count,
1253 state->mp);
1254
1255 xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
1256 xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
1257 }
1258
1259 /*
1260 * Copy out last hashval in each block for B-tree code.
1261 */
1262 blk1->hashval =
1263 INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
1264 ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
1265 blk2->hashval =
1266 INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
1267 ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
1268
1269 /*
1270 * Adjust the expected index for insertion.
1271 * NOTE: this code depends on the (current) situation that the
1272 * second block was originally empty.
1273 *
1274 * If the insertion point moved to the 2nd block, we must adjust
1275 * the index. We must also track the entry just following the
1276	 * new entry for use in an "atomic rename" operation; that entry
1277 * is always the "old" entry and the "new" entry is what we are
1278 * inserting. The index/blkno fields refer to the "old" entry,
1279 * while the index2/blkno2 fields refer to the "new" entry.
1280 */
1281 if (blk1->index > INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
1282 ASSERT(state->inleaf == 0);
1283 blk2->index = blk1->index
1284 - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
1285 args->index = args->index2 = blk2->index;
1286 args->blkno = args->blkno2 = blk2->blkno;
1287 } else if (blk1->index == INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
1288 if (state->inleaf) {
1289 args->index = blk1->index;
1290 args->blkno = blk1->blkno;
1291 args->index2 = 0;
1292 args->blkno2 = blk2->blkno;
1293 } else {
1294 blk2->index = blk1->index
1295 - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
1296 args->index = args->index2 = blk2->index;
1297 args->blkno = args->blkno2 = blk2->blkno;
1298 }
1299 } else {
1300 ASSERT(state->inleaf == 1);
1301 args->index = args->index2 = blk1->index;
1302 args->blkno = args->blkno2 = blk1->blkno;
1303 }
1304}
1305
1306/*
1307 * Examine entries until we reduce the absolute difference in
1308 * byte usage between the two blocks to a minimum.
1309 * GROT: Is this really necessary? With other than a 512 byte blocksize,
1310 * GROT: there will always be enough room in either block for a new entry.
1311 * GROT: Do a double-split for this case?
1312 */
1313STATIC int
1314xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
1315 xfs_da_state_blk_t *blk1,
1316 xfs_da_state_blk_t *blk2,
1317 int *countarg, int *usedbytesarg)
1318{
1319 xfs_attr_leafblock_t *leaf1, *leaf2;
1320 xfs_attr_leaf_hdr_t *hdr1, *hdr2;
1321 xfs_attr_leaf_entry_t *entry;
1322 int count, max, index, totallen, half;
1323 int lastdelta, foundit, tmp;
1324
1325 /*
1326 * Set up environment.
1327 */
1328 leaf1 = blk1->bp->data;
1329 leaf2 = blk2->bp->data;
1330 hdr1 = &leaf1->hdr;
1331 hdr2 = &leaf2->hdr;
1332 foundit = 0;
1333 totallen = 0;
1334
1335 /*
1336 * Examine entries until we reduce the absolute difference in
1337 * byte usage between the two blocks to a minimum.
1338 */
1339 max = INT_GET(hdr1->count, ARCH_CONVERT)
1340 + INT_GET(hdr2->count, ARCH_CONVERT);
1341 half = (max+1) * sizeof(*entry);
1342 half += INT_GET(hdr1->usedbytes, ARCH_CONVERT)
1343 + INT_GET(hdr2->usedbytes, ARCH_CONVERT)
1344 + xfs_attr_leaf_newentsize(state->args,
1345 state->blocksize, NULL);
1346 half /= 2;
1347 lastdelta = state->blocksize;
1348 entry = &leaf1->entries[0];
1349 for (count = index = 0; count < max; entry++, index++, count++) {
1350
1351#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
1352 /*
1353 * The new entry is in the first block, account for it.
1354 */
1355 if (count == blk1->index) {
1356 tmp = totallen + sizeof(*entry) +
1357 xfs_attr_leaf_newentsize(state->args,
1358 state->blocksize,
1359 NULL);
1360 if (XFS_ATTR_ABS(half - tmp) > lastdelta)
1361 break;
1362 lastdelta = XFS_ATTR_ABS(half - tmp);
1363 totallen = tmp;
1364 foundit = 1;
1365 }
1366
1367 /*
1368 * Wrap around into the second block if necessary.
1369 */
1370 if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
1371 leaf1 = leaf2;
1372 entry = &leaf1->entries[0];
1373 index = 0;
1374 }
1375
1376 /*
1377 * Figure out if next leaf entry would be too much.
1378 */
1379 tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
1380 index);
1381 if (XFS_ATTR_ABS(half - tmp) > lastdelta)
1382 break;
1383 lastdelta = XFS_ATTR_ABS(half - tmp);
1384 totallen = tmp;
1385#undef XFS_ATTR_ABS
1386 }
1387
1388 /*
1389	 * Calculate the number of usedbytes that will end up in the lower block.
1390	 * If the new entry is not in the lower block, fix up the count.
1391 */
1392 totallen -= count * sizeof(*entry);
1393 if (foundit) {
1394 totallen -= sizeof(*entry) +
1395 xfs_attr_leaf_newentsize(state->args,
1396 state->blocksize,
1397 NULL);
1398 }
1399
1400 *countarg = count;
1401 *usedbytesarg = totallen;
1402 return(foundit);
1403}
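
/*
 * Illustrative sketch (not part of the original source): figure_balance
 * above walks the combined entry list keeping a running byte total and
 * stops at the first point where taking one more entry would move the
 * total further from "half" than the previous step did; because the
 * running total is monotone, that first uphill step marks the minimum.
 * The same idea over a plain array of entry sizes:
 */
static int				/* entries that stay in block 1 */
attr_sketch_split_point(const int *entsize, int nents)
{
	int sum, half, running, tmp, delta, lastdelta, count;

	sum = 0;
	for (count = 0; count < nents; count++)
		sum += entsize[count];
	half = sum / 2;

	running = 0;
	lastdelta = sum;		/* worse than any real delta */
	for (count = 0; count < nents; count++) {
		tmp = running + entsize[count];
		delta = (tmp > half) ? tmp - half : half - tmp;
		if (delta > lastdelta)
			break;		/* previous split was the minimum */
		lastdelta = delta;
		running = tmp;
	}
	return count;
}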
1404
1405/*========================================================================
1406 * Routines used for shrinking the Btree.
1407 *========================================================================*/
1408
1409/*
1410 * Check a leaf block and its neighbors to see if the block should be
1411 * collapsed into one or the other neighbor. Always keep the block
1412 * with the smaller block number.
1413 * If the current block is over 50% full, don't try to join it; return 0.
1414 * If the block is empty, fill in the state structure and return 2.
1415 * If it can be collapsed, fill in the state structure and return 1.
1416 * If nothing can be done, return 0.
1417 *
1418 * GROT: allow for INCOMPLETE entries in calculation.
1419 */
1420int
1421xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1422{
1423 xfs_attr_leafblock_t *leaf;
1424 xfs_da_state_blk_t *blk;
1425 xfs_da_blkinfo_t *info;
1426 int count, bytes, forward, error, retval, i;
1427 xfs_dablk_t blkno;
1428 xfs_dabuf_t *bp;
1429
1430 /*
1431 * Check for the degenerate case of the block being over 50% full.
1432 * If so, it's not worth even looking to see if we might be able
1433 * to coalesce with a sibling.
1434 */
1435 blk = &state->path.blk[ state->path.active-1 ];
1436 info = blk->bp->data;
1437 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
1438 leaf = (xfs_attr_leafblock_t *)info;
1439 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1440 bytes = sizeof(xfs_attr_leaf_hdr_t) +
1441 count * sizeof(xfs_attr_leaf_entry_t) +
1442 INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1443 if (bytes > (state->blocksize >> 1)) {
1444 *action = 0; /* blk over 50%, don't try to join */
1445 return(0);
1446 }
1447
1448 /*
1449 * Check for the degenerate case of the block being empty.
1450 * If the block is empty, we'll simply delete it, no need to
1451	 * coalesce it with a sibling block. We choose (arbitrarily)
1452 * to merge with the forward block unless it is NULL.
1453 */
1454 if (count == 0) {
1455 /*
1456 * Make altpath point to the block we want to keep and
1457 * path point to the block we want to drop (this one).
1458 */
1459 forward = info->forw;
1460 memcpy(&state->altpath, &state->path, sizeof(state->path));
1461 error = xfs_da_path_shift(state, &state->altpath, forward,
1462 0, &retval);
1463 if (error)
1464 return(error);
1465 if (retval) {
1466 *action = 0;
1467 } else {
1468 *action = 2;
1469 }
1470 return(0);
1471 }
1472
1473 /*
1474 * Examine each sibling block to see if we can coalesce with
1475 * at least 25% free space to spare. We need to figure out
1476 * whether to merge with the forward or the backward block.
1477 * We prefer coalescing with the lower numbered sibling so as
1478 * to shrink an attribute list over time.
1479 */
1480 /* start with smaller blk num */
1481 forward = (INT_GET(info->forw, ARCH_CONVERT)
1482 < INT_GET(info->back, ARCH_CONVERT));
1483 for (i = 0; i < 2; forward = !forward, i++) {
1484 if (forward)
1485 blkno = INT_GET(info->forw, ARCH_CONVERT);
1486 else
1487 blkno = INT_GET(info->back, ARCH_CONVERT);
1488 if (blkno == 0)
1489 continue;
1490 error = xfs_da_read_buf(state->args->trans, state->args->dp,
1491 blkno, -1, &bp, XFS_ATTR_FORK);
1492 if (error)
1493 return(error);
1494 ASSERT(bp != NULL);
1495
1496 leaf = (xfs_attr_leafblock_t *)info;
1497 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1498 bytes = state->blocksize - (state->blocksize>>2);
1499 bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1500 leaf = bp->data;
1501 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1502 == XFS_ATTR_LEAF_MAGIC);
1503 count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
1504 bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1505 bytes -= count * sizeof(xfs_attr_leaf_entry_t);
1506 bytes -= sizeof(xfs_attr_leaf_hdr_t);
1507 xfs_da_brelse(state->args->trans, bp);
1508 if (bytes >= 0)
1509 break; /* fits with at least 25% to spare */
1510 }
1511 if (i >= 2) {
1512 *action = 0;
1513 return(0);
1514 }
1515
1516 /*
1517 * Make altpath point to the block we want to keep (the lower
1518 * numbered block) and path point to the block we want to drop.
1519 */
1520 memcpy(&state->altpath, &state->path, sizeof(state->path));
1521 if (blkno < blk->blkno) {
1522 error = xfs_da_path_shift(state, &state->altpath, forward,
1523 0, &retval);
1524 } else {
1525 error = xfs_da_path_shift(state, &state->path, forward,
1526 0, &retval);
1527 }
1528 if (error)
1529 return(error);
1530 if (retval) {
1531 *action = 0;
1532 } else {
1533 *action = 1;
1534 }
1535 return(0);
1536}
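
/*
 * Illustrative sketch (not part of the original source): the sibling
 * check above asks whether both blocks' entries fit in 75% of a single
 * block, i.e. whether a merge would leave at least 25% free.  As plain
 * arithmetic, with "slotsize" standing in for
 * sizeof(xfs_attr_leaf_entry_t) and "hdrsize" for the one surviving
 * leaf header:
 */
static int				/* nonzero if a merge would fit */
attr_sketch_can_merge(int blocksize, int hdrsize, int slotsize,
		      int count1, int used1, int count2, int used2)
{
	int bytes;

	bytes = blocksize - (blocksize >> 2);	/* keep 25% in reserve */
	bytes -= used1 + used2;			/* name/value bytes */
	bytes -= (count1 + count2) * slotsize;	/* entry table slots */
	bytes -= hdrsize;
	return bytes >= 0;
}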
1537
1538/*
1539 * Remove a name from the leaf attribute list structure.
1540 *
1541 * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
1542 * If two leaves are 37% full, when combined they will leave 25% free.
1543 */
1544int
1545xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
1546{
1547 xfs_attr_leafblock_t *leaf;
1548 xfs_attr_leaf_hdr_t *hdr;
1549 xfs_attr_leaf_map_t *map;
1550 xfs_attr_leaf_entry_t *entry;
1551 int before, after, smallest, entsize;
1552 int tablesize, tmp, i;
1553 xfs_mount_t *mp;
1554
1555 leaf = bp->data;
1556 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1557 == XFS_ATTR_LEAF_MAGIC);
1558 hdr = &leaf->hdr;
1559 mp = args->trans->t_mountp;
1560 ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0)
1561 && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
1562 ASSERT((args->index >= 0)
1563 && (args->index < INT_GET(hdr->count, ARCH_CONVERT)));
1564 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
1565 >= ((INT_GET(hdr->count, ARCH_CONVERT)
1566 * sizeof(*entry))+sizeof(*hdr)));
1567 entry = &leaf->entries[args->index];
1568 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
1569 >= INT_GET(hdr->firstused, ARCH_CONVERT));
1570 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
1571
1572 /*
1573 * Scan through free region table:
1574 * check for adjacency of free'd entry with an existing one,
1575 * find smallest free region in case we need to replace it,
1576 * adjust any map that borders the entry table,
1577 */
1578 tablesize = INT_GET(hdr->count, ARCH_CONVERT)
1579 * sizeof(xfs_attr_leaf_entry_t)
1580 + sizeof(xfs_attr_leaf_hdr_t);
1581 map = &hdr->freemap[0];
1582 tmp = INT_GET(map->size, ARCH_CONVERT);
1583 before = after = -1;
1584 smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
1585 entsize = xfs_attr_leaf_entsize(leaf, args->index);
1586 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
1587 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
1588 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
1589 if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
1590 INT_MOD(map->base, ARCH_CONVERT,
1591 -sizeof(xfs_attr_leaf_entry_t));
1592 INT_MOD(map->size, ARCH_CONVERT,
1593 sizeof(xfs_attr_leaf_entry_t));
1594 }
1595
1596 if ((INT_GET(map->base, ARCH_CONVERT)
1597 + INT_GET(map->size, ARCH_CONVERT))
1598 == INT_GET(entry->nameidx, ARCH_CONVERT)) {
1599 before = i;
1600 } else if (INT_GET(map->base, ARCH_CONVERT)
1601 == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
1602 after = i;
1603 } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
1604 tmp = INT_GET(map->size, ARCH_CONVERT);
1605 smallest = i;
1606 }
1607 }
1608
1609 /*
1610 * Coalesce adjacent freemap regions,
1611 * or replace the smallest region.
1612 */
1613 if ((before >= 0) || (after >= 0)) {
1614 if ((before >= 0) && (after >= 0)) {
1615 map = &hdr->freemap[before];
1616 INT_MOD(map->size, ARCH_CONVERT, entsize);
1617 INT_MOD(map->size, ARCH_CONVERT,
1618 INT_GET(hdr->freemap[after].size,
1619 ARCH_CONVERT));
1620 hdr->freemap[after].base = 0;
1621 hdr->freemap[after].size = 0;
1622 } else if (before >= 0) {
1623 map = &hdr->freemap[before];
1624 INT_MOD(map->size, ARCH_CONVERT, entsize);
1625 } else {
1626 map = &hdr->freemap[after];
1627 /* both on-disk, don't endian flip twice */
1628 map->base = entry->nameidx;
1629 INT_MOD(map->size, ARCH_CONVERT, entsize);
1630 }
1631 } else {
1632 /*
1633 * Replace smallest region (if it is smaller than free'd entry)
1634 */
1635 map = &hdr->freemap[smallest];
1636 if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
1637 INT_SET(map->base, ARCH_CONVERT,
1638 INT_GET(entry->nameidx, ARCH_CONVERT));
1639 INT_SET(map->size, ARCH_CONVERT, entsize);
1640 }
1641 }
1642
1643 /*
1644 * Did we remove the first entry?
1645 */
1646 if (INT_GET(entry->nameidx, ARCH_CONVERT)
1647 == INT_GET(hdr->firstused, ARCH_CONVERT))
1648 smallest = 1;
1649 else
1650 smallest = 0;
1651
1652 /*
1653 * Compress the remaining entries and zero out the removed stuff.
1654 */
1655 memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize);
1656 INT_MOD(hdr->usedbytes, ARCH_CONVERT, -entsize);
1657 xfs_da_log_buf(args->trans, bp,
1658 XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
1659 entsize));
1660
1661 tmp = (INT_GET(hdr->count, ARCH_CONVERT) - args->index)
1662 * sizeof(xfs_attr_leaf_entry_t);
1663 memmove((char *)entry, (char *)(entry+1), tmp);
1664 INT_MOD(hdr->count, ARCH_CONVERT, -1);
1665 xfs_da_log_buf(args->trans, bp,
1666 XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
1667 entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
1668 memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
1669
1670 /*
1671 * If we removed the first entry, re-find the first used byte
1672 * in the name area. Note that if the entry was the "firstused",
1673 * then we don't have a "hole" in our block resulting from
1674 * removing the name.
1675 */
1676 if (smallest) {
1677 tmp = XFS_LBSIZE(mp);
1678 entry = &leaf->entries[0];
1679 for (i = INT_GET(hdr->count, ARCH_CONVERT)-1;
1680 i >= 0; entry++, i--) {
1681 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
1682 >= INT_GET(hdr->firstused, ARCH_CONVERT));
1683 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
1684 < XFS_LBSIZE(mp));
1685 if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
1686 tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
1687 }
1688 INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
1689 if (!hdr->firstused) {
1690 INT_SET(hdr->firstused, ARCH_CONVERT,
1691 tmp - XFS_ATTR_LEAF_NAME_ALIGN);
1692 }
1693 } else {
1694 hdr->holes = 1; /* mark as needing compaction */
1695 }
1696 xfs_da_log_buf(args->trans, bp,
1697 XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1698
1699 /*
1700	 * Check if the leaf is less than 37% full; if so, the caller
1701	 * may want to "join" the leaf with a sibling.
1702 */
1703 tmp = sizeof(xfs_attr_leaf_hdr_t);
1704 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT)
1705 * sizeof(xfs_attr_leaf_entry_t);
1706 tmp += INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1707 return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
1708}
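
/*
 * Arithmetic note (not in the original source): the 37% threshold is
 * what makes the 25%-free merge test in xfs_attr_leaf_toosmall() work
 * out -- two leaves each under 37% full combine to under 74% of one
 * block, which still clears the 75% ceiling that a merge with 25% to
 * spare requires.
 */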
1709
1710/*
1711 * Move all the attribute list entries from drop_leaf into save_leaf.
1712 */
1713void
1714xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1715 xfs_da_state_blk_t *save_blk)
1716{
1717 xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
1718 xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
1719 xfs_mount_t *mp;
1720 char *tmpbuffer;
1721
1722 /*
1723 * Set up environment.
1724 */
1725 mp = state->mp;
1726 ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
1727 ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1728 drop_leaf = drop_blk->bp->data;
1729 save_leaf = save_blk->bp->data;
1730 ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT)
1731 == XFS_ATTR_LEAF_MAGIC);
1732 ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT)
1733 == XFS_ATTR_LEAF_MAGIC);
1734 drop_hdr = &drop_leaf->hdr;
1735 save_hdr = &save_leaf->hdr;
1736
1737 /*
1738 * Save last hashval from dying block for later Btree fixup.
1739 */
1740 drop_blk->hashval =
1741 INT_GET(drop_leaf->entries[INT_GET(drop_leaf->hdr.count,
1742 ARCH_CONVERT)-1].hashval,
1743 ARCH_CONVERT);
1744
1745 /*
1746 * Check if we need a temp buffer, or can we do it in place.
1747 * Note that we don't check "leaf" for holes because we will
1748	 * always be dropping it; toosmall() decided that for us already.
1749 */
1750 if (save_hdr->holes == 0) {
1751 /*
1752 * dest leaf has no holes, so we add there. May need
1753 * to make some room in the entry array.
1754 */
1755 if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
1756 xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
1757 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1758 } else {
1759 xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
1760 INT_GET(save_hdr->count, ARCH_CONVERT),
1761 (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
1762 mp);
1763 }
1764 } else {
1765 /*
1766 * Destination has holes, so we make a temporary copy
1767 * of the leaf and add them both to that.
1768 */
1769 tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
1770 ASSERT(tmpbuffer != NULL);
1771 memset(tmpbuffer, 0, state->blocksize);
1772 tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
1773 tmp_hdr = &tmp_leaf->hdr;
1774 tmp_hdr->info = save_hdr->info; /* struct copy */
1775 tmp_hdr->count = 0;
1776 INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
1777 if (!tmp_hdr->firstused) {
1778 INT_SET(tmp_hdr->firstused, ARCH_CONVERT,
1779 state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
1780 }
1781 tmp_hdr->usedbytes = 0;
1782 if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
1783 xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
1784 (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
1785 mp);
1786 xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
1787 INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1788 (int)INT_GET(save_hdr->count, ARCH_CONVERT),
1789 mp);
1790 } else {
1791 xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
1792 (int)INT_GET(save_hdr->count, ARCH_CONVERT),
1793 mp);
1794 xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
1795 INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1796 (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
1797 mp);
1798 }
1799 memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
1800 kmem_free(tmpbuffer, state->blocksize);
1801 }
1802
1803 xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
1804 state->blocksize - 1);
1805
1806 /*
1807 * Copy out last hashval in each block for B-tree code.
1808 */
1809 save_blk->hashval =
1810 INT_GET(save_leaf->entries[INT_GET(save_leaf->hdr.count,
1811 ARCH_CONVERT)-1].hashval,
1812 ARCH_CONVERT);
1813}
1814
1815/*========================================================================
1816 * Routines used for finding things in the Btree.
1817 *========================================================================*/
1818
1819/*
1820 * Look up a name in a leaf attribute list structure.
1821 * This is the internal routine, it uses the caller's buffer.
1822 *
1823 * Note that duplicate keys are allowed, but we only check within the
1824 * current leaf node. The Btree code must check in adjacent leaf nodes.
1825 *
1826 * Return in args->index the index into the entry[] array of either
1827 * the found entry, or where the entry should have been (insert before
1828 * that entry).
1829 *
1830 * Don't change the args->value unless we find the attribute.
1831 */
1832int
1833xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1834{
1835 xfs_attr_leafblock_t *leaf;
1836 xfs_attr_leaf_entry_t *entry;
1837 xfs_attr_leaf_name_local_t *name_loc;
1838 xfs_attr_leaf_name_remote_t *name_rmt;
1839 int probe, span;
1840 xfs_dahash_t hashval;
1841
1842 leaf = bp->data;
1843 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1844 == XFS_ATTR_LEAF_MAGIC);
1845 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
1846 < (XFS_LBSIZE(args->dp->i_mount)/8));
1847
1848 /*
1849 * Binary search. (note: small blocks will skip this loop)
1850 */
1851 hashval = args->hashval;
1852 probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
1853 for (entry = &leaf->entries[probe]; span > 4;
1854 entry = &leaf->entries[probe]) {
1855 span /= 2;
1856 if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
1857 probe += span;
1858 else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
1859 probe -= span;
1860 else
1861 break;
1862 }
1863 ASSERT((probe >= 0) &&
1864 (!leaf->hdr.count
1865 || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
1866 ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT)
1867 == hashval));
1868
1869 /*
1870 * Since we may have duplicate hashval's, find the first matching
1871 * hashval in the leaf.
1872 */
1873 while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT)
1874 >= hashval)) {
1875 entry--;
1876 probe--;
1877 }
1878 while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
1879 && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
1880 entry++;
1881 probe++;
1882 }
1883 if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT))
1884 || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
1885 args->index = probe;
1886 return(XFS_ERROR(ENOATTR));
1887 }
1888
1889 /*
1890 * Duplicate keys may be present, so search all of them for a match.
1891 */
1892 for ( ; (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
1893 && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval);
1894 entry++, probe++) {
1895/*
1896 * GROT: Add code to remove incomplete entries.
1897 */
1898 /*
1899 * If we are looking for INCOMPLETE entries, show only those.
1900 * If we are looking for complete entries, show only those.
1901 */
1902 if ((args->flags & XFS_ATTR_INCOMPLETE) !=
1903 (entry->flags & XFS_ATTR_INCOMPLETE)) {
1904 continue;
1905 }
1906 if (entry->flags & XFS_ATTR_LOCAL) {
1907 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
1908 if (name_loc->namelen != args->namelen)
1909 continue;
1910 if (memcmp(args->name, (char *)name_loc->nameval,
1911 args->namelen) != 0)
1912 continue;
1913 if (((args->flags & ATTR_SECURE) != 0) !=
1914 ((entry->flags & XFS_ATTR_SECURE) != 0))
1915 continue;
1916 if (((args->flags & ATTR_ROOT) != 0) !=
1917 ((entry->flags & XFS_ATTR_ROOT) != 0))
1918 continue;
1919 args->index = probe;
1920 return(XFS_ERROR(EEXIST));
1921 } else {
1922 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
1923 if (name_rmt->namelen != args->namelen)
1924 continue;
1925 if (memcmp(args->name, (char *)name_rmt->name,
1926 args->namelen) != 0)
1927 continue;
1928 if (((args->flags & ATTR_SECURE) != 0) !=
1929 ((entry->flags & XFS_ATTR_SECURE) != 0))
1930 continue;
1931 if (((args->flags & ATTR_ROOT) != 0) !=
1932 ((entry->flags & XFS_ATTR_ROOT) != 0))
1933 continue;
1934 args->index = probe;
1935 args->rmtblkno
1936 = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
1937 args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
1938 INT_GET(name_rmt->valuelen,
1939 ARCH_CONVERT));
1940 return(XFS_ERROR(EEXIST));
1941 }
1942 }
1943 args->index = probe;
1944 return(XFS_ERROR(ENOATTR));
1945}
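
/*
 * Illustrative sketch (not part of the original source): the lookup
 * above is a binary search on the hashval-sorted entry table, followed
 * by a linear walk back to the first entry with the wanted hashval,
 * since duplicate hashvals are allowed.  Stand-alone version over a
 * plain sorted int array:
 */
static int				/* index of first match, or -1 */
attr_sketch_find_first(const int *hashval, int count, int want)
{
	int probe, span;

	if (count == 0)
		return -1;
	probe = span = count / 2;
	while (span > 4) {		/* small tables skip this loop */
		span /= 2;
		if (hashval[probe] < want)
			probe += span;
		else if (hashval[probe] > want)
			probe -= span;
		else
			break;
	}
	while (probe > 0 && hashval[probe] >= want)
		probe--;		/* back up over duplicates */
	while (probe < count && hashval[probe] < want)
		probe++;		/* forward to first candidate */
	if (probe == count || hashval[probe] != want)
		return -1;
	return probe;
}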
1946
1947/*
1948 * Get the value associated with an attribute name from a leaf attribute
1949 * list structure.
1950 */
1951int
1952xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
1953{
1954 int valuelen;
1955 xfs_attr_leafblock_t *leaf;
1956 xfs_attr_leaf_entry_t *entry;
1957 xfs_attr_leaf_name_local_t *name_loc;
1958 xfs_attr_leaf_name_remote_t *name_rmt;
1959
1960 leaf = bp->data;
1961 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1962 == XFS_ATTR_LEAF_MAGIC);
1963 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
1964 < (XFS_LBSIZE(args->dp->i_mount)/8));
1965 ASSERT(args->index < ((int)INT_GET(leaf->hdr.count, ARCH_CONVERT)));
1966
1967 entry = &leaf->entries[args->index];
1968 if (entry->flags & XFS_ATTR_LOCAL) {
1969 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
1970 ASSERT(name_loc->namelen == args->namelen);
1971 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
1972 valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
1973 if (args->flags & ATTR_KERNOVAL) {
1974 args->valuelen = valuelen;
1975 return(0);
1976 }
1977 if (args->valuelen < valuelen) {
1978 args->valuelen = valuelen;
1979 return(XFS_ERROR(ERANGE));
1980 }
1981 args->valuelen = valuelen;
1982 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
1983 } else {
1984 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
1985 ASSERT(name_rmt->namelen == args->namelen);
1986 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
1987 valuelen = INT_GET(name_rmt->valuelen, ARCH_CONVERT);
1988 args->rmtblkno = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
1989 args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
1990 if (args->flags & ATTR_KERNOVAL) {
1991 args->valuelen = valuelen;
1992 return(0);
1993 }
1994 if (args->valuelen < valuelen) {
1995 args->valuelen = valuelen;
1996 return(XFS_ERROR(ERANGE));
1997 }
1998 args->valuelen = valuelen;
1999 }
2000 return(0);
2001}
2002
2003/*========================================================================
2004 * Utility routines.
2005 *========================================================================*/
2006
2007/*
2008 * Move the indicated entries from one leaf to another.
2009 * NOTE: this routine modifies both source and destination leaves.
2010 */
2011/*ARGSUSED*/
2012STATIC void
2013xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
2014 xfs_attr_leafblock_t *leaf_d, int start_d,
2015 int count, xfs_mount_t *mp)
2016{
2017 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
2018 xfs_attr_leaf_entry_t *entry_s, *entry_d;
2019 int desti, tmp, i;
2020
2021 /*
2022 * Check for nothing to do.
2023 */
2024 if (count == 0)
2025 return;
2026
2027 /*
2028 * Set up environment.
2029 */
2030 ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT)
2031 == XFS_ATTR_LEAF_MAGIC);
2032 ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT)
2033 == XFS_ATTR_LEAF_MAGIC);
2034 hdr_s = &leaf_s->hdr;
2035 hdr_d = &leaf_d->hdr;
2036 ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0)
2037 && (INT_GET(hdr_s->count, ARCH_CONVERT)
2038 < (XFS_LBSIZE(mp)/8)));
2039 ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
2040 ((INT_GET(hdr_s->count, ARCH_CONVERT)
2041 * sizeof(*entry_s))+sizeof(*hdr_s)));
2042 ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
2043 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
2044 ((INT_GET(hdr_d->count, ARCH_CONVERT)
2045 * sizeof(*entry_d))+sizeof(*hdr_d)));
2046
2047 ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
2048 ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
2049 ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
2050
2051 /*
2052 * Move the entries in the destination leaf up to make a hole?
2053 */
2054 if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
2055 tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
2056 tmp *= sizeof(xfs_attr_leaf_entry_t);
2057 entry_s = &leaf_d->entries[start_d];
2058 entry_d = &leaf_d->entries[start_d + count];
2059 memmove((char *)entry_d, (char *)entry_s, tmp);
2060 }
2061
2062 /*
2063	 * Copy all entries in the same (sorted) order,
2064 * but allocate attribute info packed and in sequence.
2065 */
2066 entry_s = &leaf_s->entries[start_s];
2067 entry_d = &leaf_d->entries[start_d];
2068 desti = start_d;
2069 for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
2070 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT)
2071 >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
2072 tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
2073#ifdef GROT
2074 /*
2075 * Code to drop INCOMPLETE entries. Difficult to use as we
2076 * may also need to change the insertion index. Code turned
2077 * off for 6.2, should be revisited later.
2078 */
2079 if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
2080 memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
2081 INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
2082 INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
2083 entry_d--; /* to compensate for ++ in loop hdr */
2084 desti--;
2085 if ((start_s + i) < offset)
2086 result++; /* insertion index adjustment */
2087 } else {
2088#endif /* GROT */
2089 INT_MOD(hdr_d->firstused, ARCH_CONVERT, -tmp);
2090 /* both on-disk, don't endian flip twice */
2091 entry_d->hashval = entry_s->hashval;
2092 /* both on-disk, don't endian flip twice */
2093 entry_d->nameidx = hdr_d->firstused;
2094 entry_d->flags = entry_s->flags;
2095 ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp
2096 <= XFS_LBSIZE(mp));
2097 memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti),
2098 XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
2099 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp
2100 <= XFS_LBSIZE(mp));
2101 memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
2102 INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
2103 INT_MOD(hdr_d->usedbytes, ARCH_CONVERT, tmp);
2104 INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
2105 INT_MOD(hdr_d->count, ARCH_CONVERT, 1);
2106 tmp = INT_GET(hdr_d->count, ARCH_CONVERT)
2107 * sizeof(xfs_attr_leaf_entry_t)
2108 + sizeof(xfs_attr_leaf_hdr_t);
2109 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
2110#ifdef GROT
2111 }
2112#endif /* GROT */
2113 }
2114
2115 /*
2116 * Zero out the entries we just copied.
2117 */
2118 if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
2119 tmp = count * sizeof(xfs_attr_leaf_entry_t);
2120 entry_s = &leaf_s->entries[start_s];
2121 ASSERT(((char *)entry_s + tmp) <=
2122 ((char *)leaf_s + XFS_LBSIZE(mp)));
2123 memset((char *)entry_s, 0, tmp);
2124 } else {
2125 /*
2126 * Move the remaining entries down to fill the hole,
2127 * then zero the entries at the top.
2128 */
2129 tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
2130 tmp *= sizeof(xfs_attr_leaf_entry_t);
2131 entry_s = &leaf_s->entries[start_s + count];
2132 entry_d = &leaf_s->entries[start_s];
2133 memmove((char *)entry_d, (char *)entry_s, tmp);
2134
2135 tmp = count * sizeof(xfs_attr_leaf_entry_t);
2136 entry_s = &leaf_s->entries[INT_GET(hdr_s->count,
2137 ARCH_CONVERT)];
2138 ASSERT(((char *)entry_s + tmp) <=
2139 ((char *)leaf_s + XFS_LBSIZE(mp)));
2140 memset((char *)entry_s, 0, tmp);
2141 }
2142
2143 /*
2144 * Fill in the freemap information
2145 */
2146 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
2147 sizeof(xfs_attr_leaf_hdr_t));
2148 INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT,
2149 INT_GET(hdr_d->count, ARCH_CONVERT)
2150 * sizeof(xfs_attr_leaf_entry_t));
2151 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
2152 INT_GET(hdr_d->firstused, ARCH_CONVERT)
2153 - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
2154 hdr_d->freemap[1].base = 0;
2155 hdr_d->freemap[2].base = 0;
2156 hdr_d->freemap[1].size = 0;
2157 hdr_d->freemap[2].size = 0;
2158 hdr_s->holes = 1; /* leaf may not be compact */
2159}
2160
2161/*
2162 * Compare two leaf blocks "order".
2163 * Return 0 unless leaf2 should go before leaf1.
2164 */
2165int
2166xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
2167{
2168 xfs_attr_leafblock_t *leaf1, *leaf2;
2169
2170 leaf1 = leaf1_bp->data;
2171 leaf2 = leaf2_bp->data;
2172 ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
2173 == XFS_ATTR_LEAF_MAGIC) &&
2174 (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
2175 == XFS_ATTR_LEAF_MAGIC));
2176 if ( (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0)
2177 && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0)
2178 && ( (INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
2179 INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT))
2180 || (INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
2181 ARCH_CONVERT)-1].hashval, ARCH_CONVERT) <
2182 INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
2183 ARCH_CONVERT)-1].hashval, ARCH_CONVERT))) ) {
2184 return(1);
2185 }
2186 return(0);
2187}
2188
2189/*
2190 * Pick up the last hashvalue from a leaf block.
2191 */
2192xfs_dahash_t
2193xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
2194{
2195 xfs_attr_leafblock_t *leaf;
2196
2197 leaf = bp->data;
2198 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2199 == XFS_ATTR_LEAF_MAGIC);
2200 if (count)
2201 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
2202 if (!leaf->hdr.count)
2203 return(0);
2204 return(INT_GET(leaf->entries[INT_GET(leaf->hdr.count,
2205 ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
2206}
2207
2208/*
2209 * Calculate the number of bytes used to store the indicated attribute
2210 * (whether local or remote only calculate bytes in this block).
2211 */
2212int
2213xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
2214{
2215 xfs_attr_leaf_name_local_t *name_loc;
2216 xfs_attr_leaf_name_remote_t *name_rmt;
2217 int size;
2218
2219 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2220 == XFS_ATTR_LEAF_MAGIC);
2221 if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
2222 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
2223 size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
2224 INT_GET(name_loc->valuelen,
2225 ARCH_CONVERT));
2226 } else {
2227 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
2228 size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
2229 }
2230 return(size);
2231}
2232
2233/*
2234 * Calculate the number of bytes that would be required to store the new
2235 * attribute (whether local or remote only calculate bytes in this block).
2236 * This routine decides as a side effect whether the attribute will be
2237 * a "local" or a "remote" attribute.
2238 */
2239int
2240xfs_attr_leaf_newentsize(xfs_da_args_t *args, int blocksize, int *local)
2241{
2242 int size;
2243
2244 size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(args->namelen, args->valuelen);
2245 if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
2246 if (local) {
2247 *local = 1;
2248 }
2249 } else {
2250 size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(args->namelen);
2251 if (local) {
2252 *local = 0;
2253 }
2254 }
2255 return(size);
2256}
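
/*
 * Illustrative sketch (not part of the original source): the decision
 * above prefers the local form whenever the whole name/value pair fits
 * under the local size cap for this block size; otherwise only the
 * in-block remote header is charged, since the value itself lives in
 * separately allocated blocks.  Callers use the "local" out-parameter
 * to tag the new entry (see the XFS_ATTR_LOCAL flag in leaf_add_work).
 */
static int				/* bytes charged in this block */
attr_sketch_newentsize(int local_size, int local_max, int remote_hdr_size,
		       int *is_local)
{
	if (local_size < local_max) {	/* whole pair fits in-block */
		if (is_local)
			*is_local = 1;
		return local_size;
	}
	if (is_local)			/* value goes to remote blocks */
		*is_local = 0;
	return remote_hdr_size;
}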
2257
2258/*
2259 * Copy out attribute list entries for attr_list(), for leaf attribute lists.
2260 */
2261int
2262xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2263{
2264 attrlist_cursor_kern_t *cursor;
2265 xfs_attr_leafblock_t *leaf;
2266 xfs_attr_leaf_entry_t *entry;
2267 xfs_attr_leaf_name_local_t *name_loc;
2268 xfs_attr_leaf_name_remote_t *name_rmt;
2269 int retval, i;
2270
2271 ASSERT(bp != NULL);
2272 leaf = bp->data;
2273 cursor = context->cursor;
2274 cursor->initted = 1;
2275
2276 xfs_attr_trace_l_cl("blk start", context, leaf);
2277
2278 /*
2279 * Re-find our place in the leaf block if this is a new syscall.
2280 */
2281 if (context->resynch) {
2282 entry = &leaf->entries[0];
2283 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
2284 entry++, i++) {
2285 if (INT_GET(entry->hashval, ARCH_CONVERT)
2286 == cursor->hashval) {
2287 if (cursor->offset == context->dupcnt) {
2288 context->dupcnt = 0;
2289 break;
2290 }
2291 context->dupcnt++;
2292 } else if (INT_GET(entry->hashval, ARCH_CONVERT)
2293 > cursor->hashval) {
2294 context->dupcnt = 0;
2295 break;
2296 }
2297 }
2298 if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
2299 xfs_attr_trace_l_c("not found", context);
2300 return(0);
2301 }
2302 } else {
2303 entry = &leaf->entries[0];
2304 i = 0;
2305 }
2306 context->resynch = 0;
2307
2308 /*
2309	 * We have found our place; start copying out the new attributes.
2310 */
2311 retval = 0;
2312 for ( ; (i < INT_GET(leaf->hdr.count, ARCH_CONVERT))
2313 && (retval == 0); entry++, i++) {
2314 attrnames_t *namesp;
2315
2316 if (INT_GET(entry->hashval, ARCH_CONVERT) != cursor->hashval) {
2317 cursor->hashval = INT_GET(entry->hashval, ARCH_CONVERT);
2318 cursor->offset = 0;
2319 }
2320
2321 if (entry->flags & XFS_ATTR_INCOMPLETE)
2322 continue; /* skip incomplete entries */
2323 if (((context->flags & ATTR_SECURE) != 0) !=
2324 ((entry->flags & XFS_ATTR_SECURE) != 0) &&
2325 !(context->flags & ATTR_KERNORMALS))
2326 continue; /* skip non-matching entries */
2327 if (((context->flags & ATTR_ROOT) != 0) !=
2328 ((entry->flags & XFS_ATTR_ROOT) != 0) &&
2329 !(context->flags & ATTR_KERNROOTLS))
2330 continue; /* skip non-matching entries */
2331
2332 namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure :
2333 ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted :
2334 &attr_user);
2335
2336 if (entry->flags & XFS_ATTR_LOCAL) {
2337 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
2338 if (context->flags & ATTR_KERNOVAL) {
2339 ASSERT(context->flags & ATTR_KERNAMELS);
2340 context->count += namesp->attr_namelen +
2341 (int)name_loc->namelen + 1;
2342 } else {
2343 retval = xfs_attr_put_listent(context, namesp,
2344 (char *)name_loc->nameval,
2345 (int)name_loc->namelen,
2346 (int)INT_GET(name_loc->valuelen,
2347 ARCH_CONVERT));
2348 }
2349 } else {
2350 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
2351 if (context->flags & ATTR_KERNOVAL) {
2352 ASSERT(context->flags & ATTR_KERNAMELS);
2353 context->count += namesp->attr_namelen +
2354 (int)name_rmt->namelen + 1;
2355 } else {
2356 retval = xfs_attr_put_listent(context, namesp,
2357 (char *)name_rmt->name,
2358 (int)name_rmt->namelen,
2359 (int)INT_GET(name_rmt->valuelen,
2360 ARCH_CONVERT));
2361 }
2362 }
2363 if (retval == 0) {
2364 cursor->offset++;
2365 }
2366 }
2367 xfs_attr_trace_l_cl("blk end", context, leaf);
2368 return(retval);
2369}
2370
2371#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
2372 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
2373#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
2374 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
2375 & ~(sizeof(u_int32_t)-1))
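
/*
 * Worked example (not in the original source): ATTR_ENTSIZE rounds the
 * entry bytes -- the offset of a_name, plus the name and its NUL -- up
 * to a multiple of sizeof(u_int32_t).  With sizeof(u_int32_t) == 4
 * spelled out:
 */
static int
attr_sketch_entsize(int entbasesize, int namelen)
{
	/* e.g. base 8, namelen 5: 8 + 5 + 1 = 14, rounded up to 16 */
	return (entbasesize + namelen + 1 + 3) & ~3;
}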
2376
2377/*
2378 * Format an attribute and copy it out to the user's buffer.
2379 * Take care to check values and protect against them changing later;
2380 * we may be reading them directly out of a user buffer.
2381 */
2382/*ARGSUSED*/
2383int
2384xfs_attr_put_listent(xfs_attr_list_context_t *context,
2385 attrnames_t *namesp, char *name, int namelen, int valuelen)
2386{
2387 attrlist_ent_t *aep;
2388 int arraytop;
2389
2390 ASSERT(!(context->flags & ATTR_KERNOVAL));
2391 if (context->flags & ATTR_KERNAMELS) {
2392 char *offset;
2393
2394 ASSERT(context->count >= 0);
2395
2396 arraytop = context->count + namesp->attr_namelen + namelen + 1;
2397 if (arraytop > context->firstu) {
2398 context->count = -1; /* insufficient space */
2399 return(1);
2400 }
2401 offset = (char *)context->alist + context->count;
2402 strncpy(offset, namesp->attr_name, namesp->attr_namelen);
2403 offset += namesp->attr_namelen;
2404 strncpy(offset, name, namelen); /* real name */
2405 offset += namelen;
2406 *offset = '\0';
2407 context->count += namesp->attr_namelen + namelen + 1;
2408 return(0);
2409 }
2410
2411 ASSERT(context->count >= 0);
2412 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
2413 ASSERT(context->firstu >= sizeof(*context->alist));
2414 ASSERT(context->firstu <= context->bufsize);
2415
2416 arraytop = sizeof(*context->alist) +
2417 context->count * sizeof(context->alist->al_offset[0]);
2418 context->firstu -= ATTR_ENTSIZE(namelen);
2419 if (context->firstu < arraytop) {
2420 xfs_attr_trace_l_c("buffer full", context);
2421 context->alist->al_more = 1;
2422 return(1);
2423 }
2424
2425 aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
2426 aep->a_valuelen = valuelen;
2427 memcpy(aep->a_name, name, namelen);
2428 aep->a_name[ namelen ] = 0;
2429 context->alist->al_offset[ context->count++ ] = context->firstu;
2430 context->alist->al_count = context->count;
2431 xfs_attr_trace_l_c("add", context);
2432 return(0);
2433}
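
#if 0	/* Editor's illustrative sketch, not part of the original source */
/*
 * How a consumer walks the attrlist buffer that xfs_attr_put_listent()
 * fills: al_offset[] grows up from the header while the entries
 * themselves are packed down from the end of the buffer (firstu).
 */
static void
example_walk_attrlist(char *buffer)
{
	attrlist_t	*al = (attrlist_t *)buffer;
	attrlist_ent_t	*aep;
	int		i;

	for (i = 0; i < al->al_count; i++) {
		aep = (attrlist_ent_t *)&buffer[ al->al_offset[i] ];
		/* aep->a_name and aep->a_valuelen are now usable */
	}
}
#endif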
2434
2435/*========================================================================
2436 * Manage the INCOMPLETE flag in a leaf entry
2437 *========================================================================*/
2438
2439/*
2440 * Clear the INCOMPLETE flag on an entry in a leaf block.
2441 */
2442int
2443xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2444{
2445 xfs_attr_leafblock_t *leaf;
2446 xfs_attr_leaf_entry_t *entry;
2447 xfs_attr_leaf_name_remote_t *name_rmt;
2448 xfs_dabuf_t *bp;
2449 int error;
2450#ifdef DEBUG
2451 xfs_attr_leaf_name_local_t *name_loc;
2452 int namelen;
2453 char *name;
2454#endif /* DEBUG */
2455
2456 /*
2457 * Set up the operation.
2458 */
2459 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
2460 XFS_ATTR_FORK);
2461 if (error) {
2462 return(error);
2463 }
2464 ASSERT(bp != NULL);
2465
2466 leaf = bp->data;
2467 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2468 == XFS_ATTR_LEAF_MAGIC);
2469 ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
2470 ASSERT(args->index >= 0);
2471 entry = &leaf->entries[ args->index ];
2472 ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
2473
2474#ifdef DEBUG
2475 if (entry->flags & XFS_ATTR_LOCAL) {
2476 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
2477 namelen = name_loc->namelen;
2478 name = (char *)name_loc->nameval;
2479 } else {
2480 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
2481 namelen = name_rmt->namelen;
2482 name = (char *)name_rmt->name;
2483 }
2484 ASSERT(INT_GET(entry->hashval, ARCH_CONVERT) == args->hashval);
2485 ASSERT(namelen == args->namelen);
2486 ASSERT(memcmp(name, args->name, namelen) == 0);
2487#endif /* DEBUG */
2488
2489 entry->flags &= ~XFS_ATTR_INCOMPLETE;
2490 xfs_da_log_buf(args->trans, bp,
2491 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
2492
2493 if (args->rmtblkno) {
2494 ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
2495 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
2496 INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
2497 INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
2498 xfs_da_log_buf(args->trans, bp,
2499 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
2500 }
2501 xfs_da_buf_done(bp);
2502
2503 /*
2504 * Commit the flag value change and start the next trans in series.
2505 */
2506 error = xfs_attr_rolltrans(&args->trans, args->dp);
2507
2508 return(error);
2509}
2510
2511/*
2512 * Set the INCOMPLETE flag on an entry in a leaf block.
2513 */
2514int
2515xfs_attr_leaf_setflag(xfs_da_args_t *args)
2516{
2517 xfs_attr_leafblock_t *leaf;
2518 xfs_attr_leaf_entry_t *entry;
2519 xfs_attr_leaf_name_remote_t *name_rmt;
2520 xfs_dabuf_t *bp;
2521 int error;
2522
2523 /*
2524 * Set up the operation.
2525 */
2526 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
2527 XFS_ATTR_FORK);
2528 if (error) {
2529 return(error);
2530 }
2531 ASSERT(bp != NULL);
2532
2533 leaf = bp->data;
2534 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2535 == XFS_ATTR_LEAF_MAGIC);
2536 ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
2537 ASSERT(args->index >= 0);
2538 entry = &leaf->entries[ args->index ];
2539
2540 ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
2541 entry->flags |= XFS_ATTR_INCOMPLETE;
2542 xfs_da_log_buf(args->trans, bp,
2543 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
2544 if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
2545 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
2546 name_rmt->valueblk = 0;
2547 name_rmt->valuelen = 0;
2548 xfs_da_log_buf(args->trans, bp,
2549 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
2550 }
2551 xfs_da_buf_done(bp);
2552
2553 /*
2554 * Commit the flag value change and start the next trans in series.
2555 */
2556 error = xfs_attr_rolltrans(&args->trans, args->dp);
2557
2558 return(error);
2559}
2560
2561/*
2562 * In a single transaction, clear the INCOMPLETE flag on the leaf entry
2563 * given by args->blkno/index and set the INCOMPLETE flag on the leaf
2564 * entry given by args->blkno2/index2.
2565 *
2566 * Note that they could be in different blocks, or in the same block.
2567 */
2568int
2569xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2570{
2571 xfs_attr_leafblock_t *leaf1, *leaf2;
2572 xfs_attr_leaf_entry_t *entry1, *entry2;
2573 xfs_attr_leaf_name_remote_t *name_rmt;
2574 xfs_dabuf_t *bp1, *bp2;
2575 int error;
2576#ifdef DEBUG
2577 xfs_attr_leaf_name_local_t *name_loc;
2578 int namelen1, namelen2;
2579 char *name1, *name2;
2580#endif /* DEBUG */
2581
2582 /*
2583 * Read the block containing the "old" attr
2584 */
2585 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
2586 XFS_ATTR_FORK);
2587 if (error) {
2588 return(error);
2589 }
2590 ASSERT(bp1 != NULL);
2591
2592 /*
2593 * Read the block containing the "new" attr, if it is different
2594 */
2595 if (args->blkno2 != args->blkno) {
2596 error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
2597 -1, &bp2, XFS_ATTR_FORK);
2598 if (error) {
2599 return(error);
2600 }
2601 ASSERT(bp2 != NULL);
2602 } else {
2603 bp2 = bp1;
2604 }
2605
2606 leaf1 = bp1->data;
2607 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
2608 == XFS_ATTR_LEAF_MAGIC);
2609 ASSERT(args->index < INT_GET(leaf1->hdr.count, ARCH_CONVERT));
2610 ASSERT(args->index >= 0);
2611 entry1 = &leaf1->entries[ args->index ];
2612
2613 leaf2 = bp2->data;
2614 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
2615 == XFS_ATTR_LEAF_MAGIC);
2616 ASSERT(args->index2 < INT_GET(leaf2->hdr.count, ARCH_CONVERT));
2617 ASSERT(args->index2 >= 0);
2618 entry2 = &leaf2->entries[ args->index2 ];
2619
2620#ifdef DEBUG
2621 if (entry1->flags & XFS_ATTR_LOCAL) {
2622 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
2623 namelen1 = name_loc->namelen;
2624 name1 = (char *)name_loc->nameval;
2625 } else {
2626 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
2627 namelen1 = name_rmt->namelen;
2628 name1 = (char *)name_rmt->name;
2629 }
2630 if (entry2->flags & XFS_ATTR_LOCAL) {
2631 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
2632 namelen2 = name_loc->namelen;
2633 name2 = (char *)name_loc->nameval;
2634 } else {
2635 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
2636 namelen2 = name_rmt->namelen;
2637 name2 = (char *)name_rmt->name;
2638 }
2639 ASSERT(INT_GET(entry1->hashval, ARCH_CONVERT) == INT_GET(entry2->hashval, ARCH_CONVERT));
2640 ASSERT(namelen1 == namelen2);
2641 ASSERT(memcmp(name1, name2, namelen1) == 0);
2642#endif /* DEBUG */
2643
2644 ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
2645 ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
2646
2647 entry1->flags &= ~XFS_ATTR_INCOMPLETE;
2648 xfs_da_log_buf(args->trans, bp1,
2649 XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
2650 if (args->rmtblkno) {
2651 ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
2652 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
2653 INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
2654 INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
2655 xfs_da_log_buf(args->trans, bp1,
2656 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
2657 }
2658
2659 entry2->flags |= XFS_ATTR_INCOMPLETE;
2660 xfs_da_log_buf(args->trans, bp2,
2661 XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
2662 if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
2663 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
2664 name_rmt->valueblk = 0;
2665 name_rmt->valuelen = 0;
2666 xfs_da_log_buf(args->trans, bp2,
2667 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
2668 }
2669 xfs_da_buf_done(bp1);
2670 if (bp1 != bp2)
2671 xfs_da_buf_done(bp2);
2672
2673 /*
2674 * Commit the flag value change and start the next trans in series.
2675 */
2676 error = xfs_attr_rolltrans(&args->trans, args->dp);
2677
2678 return(error);
2679}
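
/*
 * Editor's note (a sketch of the intended use, not original text): an
 * attribute replace adds the new entry with INCOMPLETE set, writes any
 * remote value blocks, then calls xfs_attr_leaf_flipflags() so the new
 * entry becomes visible in the same transaction that hides the old one,
 * and finally removes the old entry.  A crash at any point thus leaves
 * exactly one visible copy of the attribute.
 */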
2680
2681/*========================================================================
2682 * Indiscriminately delete the entire attribute fork
2683 *========================================================================*/
2684
2685/*
2686 * Recurse (gasp!) through the attribute nodes until we find leaves.
2687 * We're doing a depth-first traversal in order to invalidate everything.
2688 */
2689int
2690xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2691{
2692 xfs_da_blkinfo_t *info;
2693 xfs_daddr_t blkno;
2694 xfs_dabuf_t *bp;
2695 int error;
2696
2697 /*
2698 * Read block 0 to see what we have to work with.
2699 * We only get here if we have extents; since we remove
2700 * the extents in reverse order, the extent containing
2701 * block 0 must still be there.
2702 */
2703 error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
2704 if (error)
2705 return(error);
2706 blkno = xfs_da_blkno(bp);
2707
2708 /*
2709 * Invalidate the tree, even if the "tree" is only a single leaf block.
2710 * This is a depth-first traversal!
2711 */
2712 info = bp->data;
2713 if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
2714 error = xfs_attr_node_inactive(trans, dp, bp, 1);
2715 } else if (INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
2716 error = xfs_attr_leaf_inactive(trans, dp, bp);
2717 } else {
2718 error = XFS_ERROR(EIO);
2719 xfs_da_brelse(*trans, bp);
2720 }
2721 if (error)
2722 return(error);
2723
2724 /*
2725 * Invalidate the incore copy of the root block.
2726 */
2727 error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
2728 if (error)
2729 return(error);
2730 xfs_da_binval(*trans, bp); /* remove from cache */
2731 /*
2732 * Commit the invalidate and start the next transaction.
2733 */
2734 error = xfs_attr_rolltrans(trans, dp);
2735
2736 return (error);
2737}
2738
2739/*
2740 * Recurse (gasp!) through the attribute nodes until we find leaves.
2741 * We're doing a depth-first traversal in order to invalidate everything.
2742 */
2743int
2744xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
2745 int level)
2746{
2747 xfs_da_blkinfo_t *info;
2748 xfs_da_intnode_t *node;
2749 xfs_dablk_t child_fsb;
2750 xfs_daddr_t parent_blkno, child_blkno;
2751 int error, count, i;
2752 xfs_dabuf_t *child_bp;
2753
2754 /*
2755 * Since this code is recursive (gasp!) we must protect ourselves.
2756 */
2757 if (level > XFS_DA_NODE_MAXDEPTH) {
2758 xfs_da_brelse(*trans, bp); /* no locks for later trans */
2759 return(XFS_ERROR(EIO));
2760 }
2761
2762 node = bp->data;
2763 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
2764 == XFS_DA_NODE_MAGIC);
2765 parent_blkno = xfs_da_blkno(bp); /* save for re-read later */
2766 count = INT_GET(node->hdr.count, ARCH_CONVERT);
2767 if (!count) {
2768 xfs_da_brelse(*trans, bp);
2769 return(0);
2770 }
2771 child_fsb = INT_GET(node->btree[0].before, ARCH_CONVERT);
2772 xfs_da_brelse(*trans, bp); /* no locks for later trans */
2773
2774 /*
2775 * If this is the node level just above the leaves, simply loop
2776 * over the leaves removing all of them. If this is higher up
2777 * in the tree, recurse downward.
2778 */
2779 for (i = 0; i < count; i++) {
2780 /*
2781 * Read the subsidiary block to see what we have to work with.
2782 * Don't do this in a transaction. This is a depth-first
2783 * traversal of the tree so we may deal with many blocks
2784 * before we come back to this one.
2785 */
2786 error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
2787 XFS_ATTR_FORK);
2788 if (error)
2789 return(error);
2790 if (child_bp) {
2791 /* save for re-read later */
2792 child_blkno = xfs_da_blkno(child_bp);
2793
2794 /*
2795 * Invalidate the subtree, by whatever means its block type requires.
2796 */
2797 info = child_bp->data;
2798 if (INT_GET(info->magic, ARCH_CONVERT)
2799 == XFS_DA_NODE_MAGIC) {
2800 error = xfs_attr_node_inactive(trans, dp,
2801 child_bp, level+1);
2802 } else if (INT_GET(info->magic, ARCH_CONVERT)
2803 == XFS_ATTR_LEAF_MAGIC) {
2804 error = xfs_attr_leaf_inactive(trans, dp,
2805 child_bp);
2806 } else {
2807 error = XFS_ERROR(EIO);
2808 xfs_da_brelse(*trans, child_bp);
2809 }
2810 if (error)
2811 return(error);
2812
2813 /*
2814 * Remove the subsidiary block from the cache
2815 * and from the log.
2816 */
2817 error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
2818 &child_bp, XFS_ATTR_FORK);
2819 if (error)
2820 return(error);
2821 xfs_da_binval(*trans, child_bp);
2822 }
2823
2824 /*
2825 * If we're not done, re-read the parent to get the next
2826 * child block number.
2827 */
2828 if ((i+1) < count) {
2829 error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
2830 &bp, XFS_ATTR_FORK);
2831 if (error)
2832 return(error);
2833 child_fsb = INT_GET(node->btree[i+1].before, ARCH_CONVERT);
2834 xfs_da_brelse(*trans, bp);
2835 }
2836 /*
2837 * Atomically commit the invalidations done so far.
2838 */
2839 if ((error = xfs_attr_rolltrans(trans, dp)))
2840 return (error);
2841 }
2842
2843 return(0);
2844}
2845
2846/*
2847 * Invalidate all of the "remote" value regions pointed to by a particular
2848 * leaf block.
2849 * Note that we must release the lock on the buffer so that we are not
2850 * caught holding something that the logging code wants to flush to disk.
2851 */
2852int
2853xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
2854{
2855 xfs_attr_leafblock_t *leaf;
2856 xfs_attr_leaf_entry_t *entry;
2857 xfs_attr_leaf_name_remote_t *name_rmt;
2858 xfs_attr_inactive_list_t *list, *lp;
2859 int error, count, size, tmp, i;
2860
2861 leaf = bp->data;
2862 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2863 == XFS_ATTR_LEAF_MAGIC);
2864
2865 /*
2866 * Count the number of "remote" value extents.
2867 */
2868 count = 0;
2869 entry = &leaf->entries[0];
2870 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
2871 if ( INT_GET(entry->nameidx, ARCH_CONVERT)
2872 && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2873 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
2874 if (name_rmt->valueblk)
2875 count++;
2876 }
2877 }
2878
2879 /*
2880 * If there are no "remote" values, we're done.
2881 */
2882 if (count == 0) {
2883 xfs_da_brelse(*trans, bp);
2884 return(0);
2885 }
2886
2887 /*
2888 * Allocate storage for a list of all the "remote" value extents.
2889 */
2890 size = count * sizeof(xfs_attr_inactive_list_t);
2891 list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
2892
2893 /*
2894 * Identify each of the "remote" value extents.
2895 */
2896 lp = list;
2897 entry = &leaf->entries[0];
2898 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
2899 if ( INT_GET(entry->nameidx, ARCH_CONVERT)
2900 && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2901 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
2902 if (name_rmt->valueblk) {
2903 /* both on-disk, don't endian flip twice */
2904 lp->valueblk = name_rmt->valueblk;
2905 INT_SET(lp->valuelen, ARCH_CONVERT,
2906 XFS_B_TO_FSB(dp->i_mount,
2907 INT_GET(name_rmt->valuelen,
2908 ARCH_CONVERT)));
2909 lp++;
2910 }
2911 }
2912 }
2913 xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */
2914
2915 /*
2916 * Invalidate each of the "remote" value extents.
2917 */
2918 error = 0;
2919 for (lp = list, i = 0; i < count; i++, lp++) {
2920 tmp = xfs_attr_leaf_freextent(trans, dp,
2921 INT_GET(lp->valueblk,
2922 ARCH_CONVERT),
2923 INT_GET(lp->valuelen,
2924 ARCH_CONVERT));
2925 if (error == 0)
2926 error = tmp; /* save only the 1st errno */
2927 }
2928
2929 kmem_free((xfs_caddr_t)list, size);
2930 return(error);
2931}
2932
2933/*
2934 * Look at all the extents for this logical region,
2935 * invalidate any buffers that are incore/in transactions.
2936 */
2937int
2938xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2939 xfs_dablk_t blkno, int blkcnt)
2940{
2941 xfs_bmbt_irec_t map;
2942 xfs_dablk_t tblkno;
2943 int tblkcnt, dblkcnt, nmap, error;
2944 xfs_daddr_t dblkno;
2945 xfs_buf_t *bp;
2946
2947 /*
2948 * Roll through the "value", invalidating the attribute value's
2949 * blocks.
2950 */
2951 tblkno = blkno;
2952 tblkcnt = blkcnt;
2953 while (tblkcnt > 0) {
2954 /*
2955 * Try to remember where we decided to put the value.
2956 */
2957 nmap = 1;
2958 error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
2959 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2960 NULL, 0, &map, &nmap, NULL);
2961 if (error) {
2962 return(error);
2963 }
2964 ASSERT(nmap == 1);
2965 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
2966
2967 /*
2968 * If it's a hole, these are already unmapped
2969 * so there's nothing to invalidate.
2970 */
2971 if (map.br_startblock != HOLESTARTBLOCK) {
2972
2973 dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
2974 map.br_startblock);
2975 dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
2976 map.br_blockcount);
2977 bp = xfs_trans_get_buf(*trans,
2978 dp->i_mount->m_ddev_targp,
2979 dblkno, dblkcnt, XFS_BUF_LOCK);
2980 xfs_trans_binval(*trans, bp);
2981 /*
2982 * Roll to next transaction.
2983 */
2984 if ((error = xfs_attr_rolltrans(trans, dp)))
2985 return (error);
2986 }
2987
2988 tblkno += map.br_blockcount;
2989 tblkcnt -= map.br_blockcount;
2990 }
2991
2992 return(0);
2993}
2994
2995
2996/*
2997 * Roll from one trans in the sequence of PERMANENT transactions to the next.
2998 */
2999int
3000xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
3001{
3002 xfs_trans_t *trans;
3003 unsigned int logres, count;
3004 int error;
3005
3006 /*
3007 * Ensure that the inode is always logged.
3008 */
3009 trans = *transp;
3010 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
3011
3012 /*
3013 * Copy the critical parameters from one trans to the next.
3014 */
3015 logres = trans->t_log_res;
3016 count = trans->t_log_count;
3017 *transp = xfs_trans_dup(trans);
3018
3019 /*
3020 * Commit the current transaction.
3021 * If this commit failed, then it'd just unlock those items that
3022 * are not marked ihold. That also means that a filesystem shutdown
3023 * is in progress. The caller takes the responsibility to cancel
3024 * the duplicate transaction that gets returned.
3025 */
3026 if ((error = xfs_trans_commit(trans, 0, NULL)))
3027 return (error);
3028
3029 trans = *transp;
3030
3031 /*
3032 * Reserve space in the log for the next transaction.
3033 * This also pushes items in the "AIL", the list of logged items,
3034 * out to disk if they are taking up space at the tail of the log
3035 * that we want to use. This requires that either nothing be locked
3036 * across this call, or that anything that is locked be logged in
3037 * the prior and the next transactions.
3038 */
3039 error = xfs_trans_reserve(trans, 0, logres, 0,
3040 XFS_TRANS_PERM_LOG_RES, count);
3041 /*
3042 * Ensure that the inode is in the new transaction and locked.
3043 */
3044 if (!error) {
3045 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
3046 xfs_trans_ihold(trans, dp);
3047 }
3048 return (error);
3049
3050}
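
/*
 * Editor's sketch of the calling pattern (assumed, not original
 * source): long-running attribute operations do a bounded amount of
 * work per transaction and then roll, so each step commits while the
 * inode stays locked and joined:
 *
 *	while (more_work) {
 *		error = do_one_step(args);	(hypothetical helper)
 *		if (error)
 *			return error;
 *		error = xfs_attr_rolltrans(&args->trans, args->dp);
 *		if (error)
 *			return error;
 *	}
 */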
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
new file mode 100644
index 000000000000..b1480e0b3349
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -0,0 +1,308 @@
1/*
2 * Copyright (c) 2000, 2002-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ATTR_LEAF_H__
33#define __XFS_ATTR_LEAF_H__
34
35/*
36 * Attribute storage layout, internal structure, access macros, etc.
37 *
38 * Attribute lists are structured around Btrees where all the data
39 * elements are in the leaf nodes. Attribute names are hashed into an int,
40 * then that int is used as the index into the Btree. Since the hashval
41 * of an attribute name may not be unique, we may have duplicate keys. The
42 * internal links in the Btree are logical block offsets into the file.
43 */
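
/*
 * Editor's sketch of the name hash (illustrative only; the real
 * routine lives in the da-btree code as xfs_da_hashname()).  The hash
 * folds the name into 32 bits with a 7-bit rotate per byte:
 *
 *	hash = 0;
 *	for (i = 0; i < namelen; i++)
 *		hash = name[i] ^ rol32(hash, 7);
 */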
44
45struct attrlist;
46struct attrlist_cursor_kern;
47struct attrnames;
48struct xfs_dabuf;
49struct xfs_da_args;
50struct xfs_da_state;
51struct xfs_da_state_blk;
52struct xfs_inode;
53struct xfs_trans;
54
55/*========================================================================
56 * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
57 *========================================================================*/
58
59/*
60 * This is the structure of the leaf nodes in the Btree.
61 *
62 * Struct leaf_entry's are packed from the top. Name/values grow from the
63 * bottom but are not packed. The freemap contains run-length-encoded entries
64 * for the free bytes after the leaf_entry's, but only the N largest such;
65 * smaller runs are dropped. When the freemap doesn't show enough space
66 * for an allocation, we compact the name/value area and try again. If we
67 * still don't have enough space, then we have to split the block. The
68 * name/value structs (both local and remote versions) must be 32bit aligned.
69 *
70 * Since we have duplicate hash keys, for each key that matches, compare
71 * the actual name string. The root and intermediate node search always
72 * takes the first-in-the-block key match found, so we should only have
73 * to work "forw"ard. If none matches, continue with the "forw"ard leaf
74 * nodes until the hash key changes or the attribute name is found.
75 *
76 * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
77 * the leaf_entry. The namespaces are independent only because we also look
78 * at the namespace bit when we are looking for a matching attribute name.
79 *
80 * We also store an "incomplete" bit in the leaf_entry. It shows that an
81 * attribute is in the middle of being created and should not be shown to
82 * the user if we crash during the time that the bit is set. We clear the
83 * bit when we have finished setting up the attribute. We do this because
84 * we cannot create some large attributes inside a single transaction, and we
85 * need some indication that we weren't finished if we crash in the middle.
86 */
87#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
88
89typedef struct xfs_attr_leafblock {
90 struct xfs_attr_leaf_hdr { /* constant-structure header block */
91 xfs_da_blkinfo_t info; /* block type, links, etc. */
92 __uint16_t count; /* count of active leaf_entry's */
93 __uint16_t usedbytes; /* num bytes of names/values stored */
94 __uint16_t firstused; /* first used byte in name area */
95 __uint8_t holes; /* != 0 if blk needs compaction */
96 __uint8_t pad1;
97 struct xfs_attr_leaf_map { /* RLE map of free bytes */
98 __uint16_t base; /* base of free region */
99 __uint16_t size; /* length of free region */
100 } freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */
101 } hdr;
102 struct xfs_attr_leaf_entry { /* sorted on key, not name */
103 xfs_dahash_t hashval; /* hash value of name */
104 __uint16_t nameidx; /* index into buffer of name/value */
105 __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
106 __uint8_t pad2; /* unused pad byte */
107 } entries[1]; /* variable sized array */
108 struct xfs_attr_leaf_name_local {
109 __uint16_t valuelen; /* number of bytes in value */
110 __uint8_t namelen; /* length of name bytes */
111 __uint8_t nameval[1]; /* name/value bytes */
112 } namelist; /* grows from bottom of buf */
113 struct xfs_attr_leaf_name_remote {
114 xfs_dablk_t valueblk; /* block number of value bytes */
115 __uint32_t valuelen; /* number of bytes in value */
116 __uint8_t namelen; /* length of name bytes */
117 __uint8_t name[1]; /* name bytes */
118 } valuelist; /* grows from bottom of buf */
119} xfs_attr_leafblock_t;
120typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t;
121typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t;
122typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t;
123typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t;
124typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t;
125
126/*
127 * Flags used in the leaf_entry[i].flags field.
128 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
129 * on the system call, they are "or"ed together for various operations.
130 */
131#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
132#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
133#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
134#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
135#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
136#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
137#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
138#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
139
140/*
141 * Alignment for namelist and valuelist entries (since they are mixed,
142 * there can be only one alignment value)
143 */
144#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
145
146/*
147 * Cast typed pointers for "local" and "remote" name/value structs.
148 */
149#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
150xfs_attr_leaf_name_remote_t *
151xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx);
152#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) \
153 xfs_attr_leaf_name_remote(leafp,idx)
154#else
155#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) /* remote name struct ptr */ \
156 ((xfs_attr_leaf_name_remote_t *) \
157 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
158#endif
159#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
160xfs_attr_leaf_name_local_t *
161xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx);
162#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) \
163 xfs_attr_leaf_name_local(leafp,idx)
164#else
165#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) /* local name struct ptr */ \
166 ((xfs_attr_leaf_name_local_t *) \
167 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
168#endif
169#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME)
170char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx);
171#define XFS_ATTR_LEAF_NAME(leafp,idx) xfs_attr_leaf_name(leafp,idx)
172#else
173#define XFS_ATTR_LEAF_NAME(leafp,idx) /* generic name struct ptr */ \
174 (&((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
175#endif
176
177/*
178 * Calculate total bytes used (including trailing pad for alignment) for
179 * a "local" name/value structure, a "remote" name/value structure, and
180 * a pointer which might be either.
181 */
182#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
183int xfs_attr_leaf_entsize_remote(int nlen);
184#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) \
185 xfs_attr_leaf_entsize_remote(nlen)
186#else
187#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) /* space for remote struct */ \
188 (((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
189 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
190#endif
191#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
192int xfs_attr_leaf_entsize_local(int nlen, int vlen);
193#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) \
194 xfs_attr_leaf_entsize_local(nlen,vlen)
195#else
196#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) /* space for local struct */ \
197 (((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + \
198 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
199#endif
200#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
201int xfs_attr_leaf_entsize_local_max(int bsize);
202#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) \
203 xfs_attr_leaf_entsize_local_max(bsize)
204#else
205#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) /* max local struct size */ \
206 (((bsize) >> 1) + ((bsize) >> 2))
207#endif
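
/*
 * Editor's worked example (assuming sizeof(xfs_attr_leaf_name_local_t)
 * == 4 and XFS_ATTR_LEAF_NAME_ALIGN == 4): a local attribute with a
 * 4-byte name and a 10-byte value consumes
 *
 *	XFS_ATTR_LEAF_ENTSIZE_LOCAL(4, 10) = (4 - 1 + 4 + 10 + 3) & ~3
 *					   = 20 bytes
 *
 * of the name/value area, plus one leaf_entry slot near the header.
 */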
208
209
210/*========================================================================
211 * Structure used to pass context around among the routines.
212 *========================================================================*/
213
214typedef struct xfs_attr_list_context {
215 struct xfs_inode *dp; /* inode */
216 struct attrlist_cursor_kern *cursor;/* position in list */
217 struct attrlist *alist; /* output buffer */
218 int count; /* num used entries */
219 int dupcnt; /* count dup hashvals seen */
220 int bufsize;/* total buffer size */
221 int firstu; /* first used byte in buffer */
222 int flags; /* from VOP call */
223 int resynch;/* T/F: resynch with cursor */
224} xfs_attr_list_context_t;
225
226/*
227 * Used to keep a list of "remote value" extents when unlinking an inode.
228 */
229typedef struct xfs_attr_inactive_list {
230 xfs_dablk_t valueblk; /* block number of value bytes */
231 int valuelen; /* number of bytes in value */
232} xfs_attr_inactive_list_t;
233
234
235/*========================================================================
236 * Function prototypes for the kernel.
237 *========================================================================*/
238
239/*
240 * Internal routines when dirsize < XFS_LITINO(mp).
241 */
242int xfs_attr_shortform_create(struct xfs_da_args *args);
243int xfs_attr_shortform_add(struct xfs_da_args *add);
244int xfs_attr_shortform_lookup(struct xfs_da_args *args);
245int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
246int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
247int xfs_attr_shortform_remove(struct xfs_da_args *remove);
248int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
249int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
250
251/*
252 * Internal routines when dirsize == XFS_LBSIZE(mp).
253 */
254int xfs_attr_leaf_to_node(struct xfs_da_args *args);
255int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
256 struct xfs_da_args *args);
257int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
258int xfs_attr_leaf_setflag(struct xfs_da_args *args);
259int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
260
261/*
262 * Routines used for growing the Btree.
263 */
264int xfs_attr_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
265 struct xfs_dabuf **bpp);
266int xfs_attr_leaf_split(struct xfs_da_state *state,
267 struct xfs_da_state_blk *oldblk,
268 struct xfs_da_state_blk *newblk);
269int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
270 struct xfs_da_args *args);
271int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
272int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
273 struct xfs_da_args *args);
274int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
275 struct xfs_da_args *args);
276int xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
277 struct xfs_attr_list_context *context);
278
279/*
280 * Routines used for shrinking the Btree.
281 */
282int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
283void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
284 struct xfs_da_state_blk *drop_blk,
285 struct xfs_da_state_blk *save_blk);
286int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
287int xfs_attr_node_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
288 struct xfs_dabuf *bp, int level);
289int xfs_attr_leaf_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
290 struct xfs_dabuf *bp);
291int xfs_attr_leaf_freextent(struct xfs_trans **trans, struct xfs_inode *dp,
292 xfs_dablk_t blkno, int blkcnt);
293
294/*
295 * Utility routines.
296 */
297xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
298int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
299 struct xfs_dabuf *leaf2_bp);
300int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int blocksize,
301 int *local);
302int xfs_attr_leaf_entsize(struct xfs_attr_leafblock *leaf, int index);
303int xfs_attr_put_listent(struct xfs_attr_list_context *context,
304 struct attrnames *, char *name, int namelen,
305 int valuelen);
306int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
307
308#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
new file mode 100644
index 000000000000..ef7d2942d306
--- /dev/null
+++ b/fs/xfs/xfs_attr_sf.h
@@ -0,0 +1,149 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ATTR_SF_H__
33#define __XFS_ATTR_SF_H__
34
35/*
36 * Attribute storage when stored inside the inode.
37 *
38 * Small attribute lists are packed as tightly as possible so as
39 * to fit into the literal area of the inode.
40 */
41
42struct xfs_inode;
43
44/*
45 * Entries are packed toward the top as tightly as possible.
46 */
47typedef struct xfs_attr_shortform {
48 struct xfs_attr_sf_hdr { /* constant-structure header block */
49 __uint16_t totsize; /* total bytes in shortform list */
50 __uint8_t count; /* count of active entries */
51 } hdr;
52 struct xfs_attr_sf_entry {
53 __uint8_t namelen; /* actual length of name (no NULL) */
54 __uint8_t valuelen; /* actual length of value (no NULL) */
55 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
56 __uint8_t nameval[1]; /* name & value bytes concatenated */
57 } list[1]; /* variable sized array */
58} xfs_attr_shortform_t;
59typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
60typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
61
62/*
63 * We generate this, then sort it; attr_list() must return things in hash-order.
64 */
65typedef struct xfs_attr_sf_sort {
66 __uint8_t entno; /* entry number in original list */
67 __uint8_t namelen; /* length of name value (no null) */
68 __uint8_t valuelen; /* length of value */
69 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
70 xfs_dahash_t hash; /* this entry's hash value */
71 char *name; /* name value, pointer into buffer */
72} xfs_attr_sf_sort_t;
73
74#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
75int xfs_attr_sf_entsize_byname(int nlen, int vlen);
76#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) \
77 xfs_attr_sf_entsize_byname(nlen,vlen)
78#else
79#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
80 ((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))
81#endif
82#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
83 ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
84#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE)
85int xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep);
86#define XFS_ATTR_SF_ENTSIZE(sfep) xfs_attr_sf_entsize(sfep)
87#else
88#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
89 ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
90#endif
91#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_NEXTENTRY)
92xfs_attr_sf_entry_t *xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep);
93#define XFS_ATTR_SF_NEXTENTRY(sfep) xfs_attr_sf_nextentry(sfep)
94#else
95#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
96 ((xfs_attr_sf_entry_t *) \
97 ((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
98#endif
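
#if 0	/* Editor's illustrative sketch, not part of the original source */
/*
 * Walking a shortform attribute list: entries are variable-sized and
 * simply concatenated, so advance with XFS_ATTR_SF_NEXTENTRY() rather
 * than array indexing.
 */
static void
example_walk_shortform(struct xfs_inode *dp)
{
	xfs_attr_shortform_t	*sf;
	xfs_attr_sf_entry_t	*sfe;
	int			i;

	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
	sfe = &sf->list[0];
	for (i = 0; i < sf->hdr.count; i++) {
		/* sfe->nameval: namelen name bytes, then valuelen value bytes */
		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
	}
}
#endif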
99#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_TOTSIZE)
100int xfs_attr_sf_totsize(struct xfs_inode *dp);
101#define XFS_ATTR_SF_TOTSIZE(dp) xfs_attr_sf_totsize(dp)
102#else
103#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
104 (INT_GET(((xfs_attr_shortform_t *)((dp)->i_afp->if_u1.if_data))->hdr.totsize, ARCH_CONVERT))
105#endif
106
107#if defined(XFS_ATTR_TRACE)
108/*
109 * Kernel tracing support for attribute lists
110 */
111struct xfs_attr_list_context;
112struct xfs_da_intnode;
113struct xfs_da_node_entry;
114struct xfs_attr_leafblock;
115
116#define XFS_ATTR_TRACE_SIZE 4096 /* size of global trace buffer */
117extern ktrace_t *xfs_attr_trace_buf;
118
119/*
120 * Trace record types.
121 */
122#define XFS_ATTR_KTRACE_L_C 1 /* context */
123#define XFS_ATTR_KTRACE_L_CN 2 /* context, node */
124#define XFS_ATTR_KTRACE_L_CB 3 /* context, btree */
125#define XFS_ATTR_KTRACE_L_CL 4 /* context, leaf */
126
127void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
128void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
129 struct xfs_da_intnode *node);
130void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
131 struct xfs_da_node_entry *btree);
132void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
133 struct xfs_attr_leafblock *leaf);
134void xfs_attr_trace_enter(int type, char *where,
135 __psunsigned_t a2, __psunsigned_t a3,
136 __psunsigned_t a4, __psunsigned_t a5,
137 __psunsigned_t a6, __psunsigned_t a7,
138 __psunsigned_t a8, __psunsigned_t a9,
139 __psunsigned_t a10, __psunsigned_t a11,
140 __psunsigned_t a12, __psunsigned_t a13,
141 __psunsigned_t a14, __psunsigned_t a15);
142#else
143#define xfs_attr_trace_l_c(w,c)
144#define xfs_attr_trace_l_cn(w,c,n)
145#define xfs_attr_trace_l_cb(w,c,b)
146#define xfs_attr_trace_l_cl(w,c,l)
147#endif /* XFS_ATTR_TRACE */
148
149#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c
new file mode 100644
index 000000000000..16088e175ecc
--- /dev/null
+++ b/fs/xfs/xfs_behavior.c
@@ -0,0 +1,218 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 *
32 */
33#include "xfs.h"
34
35/*
36 * Source file used to associate/disassociate behaviors with virtualized
37 * objects. See xfs_behavior.h for more information about behaviors, etc.
38 *
39 * The implementation is split between functions in this file and macros
40 * in xfs_behavior.h.
41 */
42
43/*
44 * Insert a new behavior descriptor into a behavior chain.
45 *
46 * The behavior chain is ordered based on the 'position' number which
47 * lives in the first field of the ops vector (higher numbers first).
48 *
49 * Attempts to insert duplicate ops result in an EINVAL return code.
50 * Otherwise, return 0 to indicate success.
51 */
52int
53bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
54{
55 bhv_desc_t *curdesc, *prev;
56 int position;
57
58 /*
59 * Validate the position value of the new behavior.
60 */
61 position = BHV_POSITION(bdp);
62 ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP);
63
64 /*
65 * Find location to insert behavior. Check for duplicates.
66 */
67 prev = NULL;
68 for (curdesc = bhp->bh_first;
69 curdesc != NULL;
70 curdesc = curdesc->bd_next) {
71
72 /* Check for duplication. */
73 if (curdesc->bd_ops == bdp->bd_ops) {
74 ASSERT(0);
75 return EINVAL;
76 }
77
78 /* Find correct position */
79 if (position >= BHV_POSITION(curdesc)) {
80 ASSERT(position != BHV_POSITION(curdesc));
81 break; /* found it */
82 }
83
84 prev = curdesc;
85 }
86
87 if (prev == NULL) {
88 /* insert at front of chain */
89 bdp->bd_next = bhp->bh_first;
90 bhp->bh_first = bdp;
91 } else {
92 /* insert after prev */
93 bdp->bd_next = prev->bd_next;
94 prev->bd_next = bdp;
95 }
96
97 return 0;
98}
99
100/*
101 * Remove a behavior descriptor from a position in a behavior chain;
102 * the position is guaranteed not to be the first position.
103 * Should only be called by the bhv_remove() macro.
104 */
105void
106bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
107{
108 bhv_desc_t *curdesc, *prev;
109
110 ASSERT(bhp->bh_first != NULL);
111 ASSERT(bhp->bh_first->bd_next != NULL);
112
113 prev = bhp->bh_first;
114 for (curdesc = bhp->bh_first->bd_next;
115 curdesc != NULL;
116 curdesc = curdesc->bd_next) {
117
118 if (curdesc == bdp)
119 break; /* found it */
120 prev = curdesc;
121 }
122
123 ASSERT(curdesc == bdp);
124 prev->bd_next = bdp->bd_next; /* remove from after prev */
125}
126
127/*
128 * Look for a specific ops vector on the specified behavior chain.
129 * Return the associated behavior descriptor, or NULL if not found.
130 */
131bhv_desc_t *
132bhv_lookup(bhv_head_t *bhp, void *ops)
133{
134 bhv_desc_t *curdesc;
135
136 for (curdesc = bhp->bh_first;
137 curdesc != NULL;
138 curdesc = curdesc->bd_next) {
139
140 if (curdesc->bd_ops == ops)
141 return curdesc;
142 }
143
144 return NULL;
145}
146
147/*
148 * Look for the first behavior within a specified range of positions.
149 * Return the associated behavior descriptor, or NULL if none is found.
150 */
151bhv_desc_t *
152bhv_lookup_range(bhv_head_t *bhp, int low, int high)
153{
154 bhv_desc_t *curdesc;
155
156 for (curdesc = bhp->bh_first;
157 curdesc != NULL;
158 curdesc = curdesc->bd_next) {
159
160 int position = BHV_POSITION(curdesc);
161
162 if (position <= high) {
163 if (position >= low)
164 return curdesc;
165 return NULL;
166 }
167 }
168
169 return NULL;
170}
171
172/*
173 * Return the base behavior in the chain, or NULL if the chain
174 * is empty.
175 *
176 * No behavior-chain locking is done here; on Linux there is no bhv
177 * locking (see the bhv_base_unlocked alias in xfs_behavior.h).
178 */
179bhv_desc_t *
180bhv_base(bhv_head_t *bhp)
181{
182 bhv_desc_t *curdesc;
183
184 for (curdesc = bhp->bh_first;
185 curdesc != NULL;
186 curdesc = curdesc->bd_next) {
187
188 if (curdesc->bd_next == NULL) {
189 return curdesc;
190 }
191 }
192
193 return NULL;
194}
195
196void
197bhv_head_init(
198 bhv_head_t *bhp,
199 char *name)
200{
201 bhp->bh_first = NULL;
202}
203
204void
205bhv_insert_initial(
206 bhv_head_t *bhp,
207 bhv_desc_t *bdp)
208{
209 ASSERT(bhp->bh_first == NULL);
210 (bhp)->bh_first = bdp;
211}
212
213void
214bhv_head_destroy(
215 bhv_head_t *bhp)
216{
217 ASSERT(bhp->bh_first == NULL);
218}
diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
new file mode 100644
index 000000000000..d5ed5a843921
--- /dev/null
+++ b/fs/xfs/xfs_behavior.h
@@ -0,0 +1,204 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BEHAVIOR_H__
33#define __XFS_BEHAVIOR_H__
34
35/*
36 * Header file used to associate behaviors with virtualized objects.
37 *
38 * A virtualized object is an internal, virtualized representation of
39 * OS entities such as persistent files, processes, or sockets. Examples
40 * of virtualized objects include vnodes, vprocs, and vsockets. Often
41 * a virtualized object is referred to simply as an "object."
42 *
43 * A behavior is essentially an implementation layer associated with
44 * an object. Multiple behaviors for an object are chained together,
45 * the order of chaining determining the order of invocation. Each
46 * behavior of a given object implements the same set of interfaces
47 * (e.g., the VOP interfaces).
48 *
49 * Behaviors may be dynamically inserted into an object's behavior chain,
50 * such that the addition is transparent to consumers that already have
51 * references to the object. Typically, a given behavior will be inserted
52 * at a particular location in the behavior chain. Insertion of new
53 * behaviors is synchronized with operations-in-progress (oip's) so that
54 * the oip's always see a consistent view of the chain.
55 *
56 * The term "interposition" is used to refer to the act of inserting
57 * a behavior such that it interposes on (i.e., is inserted in front
58 * of) a particular other behavior. A key example of this is when a
59 * system implementing distributed single system image wishes to
60 * interpose a distribution layer (providing distributed coherency)
61 * in front of an object that is otherwise only accessed locally.
62 *
63 * Note that the traditional vnode/inode combination is simply a virtualized
64 * object that has exactly one associated behavior.
65 *
66 * Behavior synchronization is the logic needed, under certain
67 * circumstances, to ensure there is no conflict between ongoing
68 * operations traversing the behavior chain and those dynamically
69 * modifying the chain. Because behavior synchronization adds extra overhead
70 * to virtual operation invocation, we want to restrict, as much as
71 * we can, the requirement for this extra code, to those situations
72 * in which it is truly necessary.
73 *
74 * Behavior synchronization is needed whenever there's at least one class
75 * of object in the system for which:
76 * 1) multiple behaviors for a given object are supported,
77 * -- AND --
78 * 2a) insertion of a new behavior can happen dynamically at any time during
79 * the life of an active object,
80 * -- AND --
81 * 3a) insertion of a new behavior needs to synchronize with existing
82 * ops-in-progress.
83 * -- OR --
84 * 3b) multiple different behaviors can be dynamically inserted at
85 * any time during the life of an active object
86 * -- OR --
87 * 3c) removal of a behavior can occur at any time during the life of
88 * an active object.
89 * -- OR --
90 * 2b) removal of a behavior can occur at any time during the life of an
91 * active object
92 *
93 */
94
95struct bhv_head_lock;
96
97/*
98 * Behavior head. Head of the chain of behaviors.
99 * Contained within each virtualized object data structure.
100 */
101typedef struct bhv_head {
102 struct bhv_desc *bh_first; /* first behavior in chain */
103 struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
104} bhv_head_t;
105
106/*
107 * Behavior descriptor. Descriptor associated with each behavior.
108 * Contained within the behavior's private data structure.
109 */
110typedef struct bhv_desc {
111 void *bd_pdata; /* private data for this behavior */
112 void *bd_vobj; /* virtual object associated with */
113 void *bd_ops; /* ops for this behavior */
114 struct bhv_desc *bd_next; /* next behavior in chain */
115} bhv_desc_t;
116
117/*
118 * Behavior identity field. A behavior's identity determines the position
119 * where it lives within a behavior chain, and it's always the first field
120 * of the behavior's ops vector. The optional id field further identifies the
121 * subsystem responsible for the behavior.
122 */
123typedef struct bhv_identity {
124 __u16 bi_id; /* owning subsystem id */
125 __u16 bi_position; /* position in chain */
126} bhv_identity_t;
127
128typedef bhv_identity_t bhv_position_t;
129
130#define BHV_IDENTITY_INIT(id,pos) {id, pos}
131#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
132
133/*
134 * Define boundaries of position values.
135 */
136#define BHV_POSITION_INVALID 0 /* invalid position number */
137#define BHV_POSITION_BASE 1 /* base (last) implementation layer */
138#define BHV_POSITION_TOP 63 /* top (first) implementation layer */
139
140/*
141 * Plumbing macros.
142 */
143#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first)
144#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next)
145#define BHV_NEXTNULL(bdp) ((bdp)->bd_next)
146#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
147#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj)
148#define BHV_PDATA(bdp) (bdp)->bd_pdata
149#define BHV_OPS(bdp) (bdp)->bd_ops
150#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops)
151#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position)
152
153extern void bhv_head_init(bhv_head_t *, char *);
154extern void bhv_head_destroy(bhv_head_t *);
155extern int bhv_insert(bhv_head_t *, bhv_desc_t *);
156extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
157
158/*
159 * Initialize a new behavior descriptor.
160 * Arguments:
161 * bdp - pointer to behavior descriptor
162 * pdata - pointer to behavior's private data
163 * vobj - pointer to associated virtual object
164 * ops - pointer to ops for this behavior
165 */
166#define bhv_desc_init(bdp, pdata, vobj, ops) \
167 { \
168 (bdp)->bd_pdata = pdata; \
169 (bdp)->bd_vobj = vobj; \
170 (bdp)->bd_ops = ops; \
171 (bdp)->bd_next = NULL; \
172 }
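
/*
 * Editor's sketch of typical use (the my_* names are hypothetical):
 * the ops vector must begin with a bhv_position_t, since BHV_POSITION()
 * reads the position out of the first field of bd_ops.
 *
 *	static struct my_ops {
 *		bhv_position_t	position;	(must be first)
 *		...operation function pointers...
 *	} my_ops = {
 *		BHV_IDENTITY_INIT_POSITION(BHV_POSITION_BASE),
 *		...
 *	};
 *
 *	bhv_desc_init(&bdp, my_private_data, vobj, &my_ops);
 *	error = bhv_insert(bhvhead, &bdp);
 */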
173
174/*
175 * Remove a behavior descriptor from a behavior chain.
176 */
177#define bhv_remove(bhp, bdp) \
178 { \
179 if ((bhp)->bh_first == (bdp)) { \
180 /* \
181 * Remove from front of chain. \
182 * Atomic wrt oip's. \
183 */ \
184 (bhp)->bh_first = (bdp)->bd_next; \
185 } else { \
186 /* remove from non-front of chain */ \
187 bhv_remove_not_first(bhp, bdp); \
188 } \
189 (bdp)->bd_vobj = NULL; \
190 }
191
192/*
193 * Behavior module prototypes.
194 */
195extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
196extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops);
197extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high);
198extern bhv_desc_t * bhv_base(bhv_head_t *bhp);
199
200/* No bhv locking on Linux */
201#define bhv_lookup_unlocked bhv_lookup
202#define bhv_base_unlocked bhv_base
203
204#endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
new file mode 100644
index 000000000000..a20a6c3dc13e
--- /dev/null
+++ b/fs/xfs/xfs_bit.c
@@ -0,0 +1,312 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * XFS bit manipulation routines, used in non-realtime code.
35 */
36
37#include "xfs.h"
38#include "xfs_bit.h"
39#include "xfs_log.h"
40#include "xfs_trans.h"
41#include "xfs_buf_item.h"
42
43
44#ifndef HAVE_ARCH_HIGHBIT
45/*
46 * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
47 */
48const char xfs_highbit[256] = {
49 -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */
50 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */
51 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */
52 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */
53 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */
54 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */
55 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */
56 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */
57 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */
58 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */
59 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */
60 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */
61 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */
62 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */
63 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */
64 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */
65 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */
66 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */
67 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */
68 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */
69 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */
70 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */
71 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */
72 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */
73 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */
74 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */
75 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */
76 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */
77 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */
78 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */
79 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */
80 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */
81};
82#endif
83
84/*
85 * Count of bits set in byte, 0..8.
86 */
87static const char xfs_countbit[256] = {
88 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */
89 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */
90 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */
91 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */
92 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */
93 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */
94 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */
95 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */
96 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */
97 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */
98 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */
99 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */
100 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */
101 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */
102 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */
103 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */
104 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */
105 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */
106 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */
107 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */
108 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */
109 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */
110 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */
111 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */
112 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */
113 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */
114 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */
115 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */
116 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */
117 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */
118 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */
119 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */
120};
121
122/*
123 * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
124 */
125inline int
126xfs_highbit32(
127 __uint32_t v)
128{
129#ifdef HAVE_ARCH_HIGHBIT
130 return highbit32(v);
131#else
132 int i;
133
134 if (v & 0xffff0000)
135 if (v & 0xff000000)
136 i = 24;
137 else
138 i = 16;
139 else if (v & 0x0000ffff)
140 if (v & 0x0000ff00)
141 i = 8;
142 else
143 i = 0;
144 else
145 return -1;
146 return i + xfs_highbit[(v >> i) & 0xff];
147#endif
148}
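/*
 * Editor's examples (not from the original source): how the two-level
 * decomposition above resolves a few inputs.
 *
 *	xfs_highbit32(0)          == -1	(no bits set)
 *	xfs_highbit32(1)          ==  0	(i = 0,  xfs_highbit[0x01] == 0)
 *	xfs_highbit32(0x00008000) == 15	(i = 8,  xfs_highbit[0x80] == 7)
 *	xfs_highbit32(0x80000000) == 31	(i = 24, xfs_highbit[0x80] == 7)
 */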
149
150/*
151 * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
152 */
153int
154xfs_lowbit64(
155 __uint64_t v)
156{
157 __uint32_t w = (__uint32_t)v;
158 int n = 0;
159
160 if (w) { /* lower bits */
161 n = ffs(w);
162 } else { /* upper bits */
163 w = (__uint32_t)(v >> 32);
164 if (w && (n = ffs(w)))
165 n += 32;
166 }
167 return n - 1;
168}
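/*
 * Editor's note (not in the original source): ffs() returns a 1-based
 * bit index, with ffs(0) == 0, hence the "n - 1" above.  For example:
 *
 *	xfs_lowbit64(0)                     == -1
 *	xfs_lowbit64(1ULL)                  ==  0	(ffs(1) == 1)
 *	xfs_lowbit64(0x0000000100000000ULL) == 32	(upper-word path)
 */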
169
170/*
171 * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
172 */
173int
174xfs_highbit64(
175 __uint64_t v)
176{
177 __uint32_t h = (__uint32_t)(v >> 32);
178
179 if (h)
180 return xfs_highbit32(h) + 32;
181 return xfs_highbit32((__uint32_t)v);
182}
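/*
 * Editor's example (not in the original source):
 *
 *	xfs_highbit64(0x0000000100000000ULL) == 32	(high word is 1,
 *	so the result is xfs_highbit32(1) + 32)
 */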
183
184
185/*
186 * Count the number of bits set in the bitmap starting with bit
187 * start_bit. Size is the size of the bitmap in words.
188 *
189 * Do the counting by mapping a byte value to the number of set
190 * bits for that value using the xfs_countbit array, i.e.
191 * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1,
192 * xfs_countbit[3] == 2, etc.
193 */
194int
195xfs_count_bits(uint *map, uint size, uint start_bit)
196{
197 register int bits;
198 register unsigned char *bytep;
199 register unsigned char *end_map;
200 int byte_bit;
201
202 bits = 0;
203 end_map = (char*)(map + size);
204 bytep = (char*)(map + (start_bit & ~0x7));
205 byte_bit = start_bit & 0x7;
206
207 /*
208 * If the caller fell off the end of the map, return 0.
209 */
210 if (bytep >= end_map) {
211 return (0);
212 }
213
214 /*
215 * If start_bit is not byte aligned, then process the
216 * first byte separately.
217 */
218 if (byte_bit != 0) {
219 /*
220 * Shift off the bits we don't want to look at,
221 * before indexing into xfs_countbit.
222 */
223 bits += xfs_countbit[(*bytep >> byte_bit)];
224 bytep++;
225 }
226
227 /*
228 * Count the bits in each byte until the end of the bitmap.
229 */
230 while (bytep < end_map) {
231 bits += xfs_countbit[*bytep];
232 bytep++;
233 }
234
235 return (bits);
236}
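/*
 * Editor's example (not from the original source), assuming a
 * little-endian word layout so that byte 0 of the map holds its low
 * bits: with a single-word map of 0xf0 (bits 4..7 set),
 *
 *	xfs_count_bits(&map, 1, 0) == 4	(all four set bits counted)
 *	xfs_count_bits(&map, 1, 5) == 3	(0xf0 >> 5 == 0x7, three bits)
 */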
237
238/*
239 * Count the number of contiguous bits set in the bitmap starting with bit
240 * start_bit. Size is the size of the bitmap in words.
241 */
242int
243xfs_contig_bits(uint *map, uint size, uint start_bit)
244{
245 uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
246 uint result = 0;
247 uint tmp;
248
249 size <<= BIT_TO_WORD_SHIFT;
250
251 ASSERT(start_bit < size);
252 size -= start_bit & ~(NBWORD - 1);
253 start_bit &= (NBWORD - 1);
254 if (start_bit) {
255 tmp = *p++;
256 /* force the bits below start_bit to one */
257 tmp |= (~0U >> (NBWORD-start_bit));
258 if (tmp != ~0U)
259 goto found;
260 result += NBWORD;
261 size -= NBWORD;
262 }
263 while (size) {
264 if ((tmp = *p++) != ~0U)
265 goto found;
266 result += NBWORD;
267 size -= NBWORD;
268 }
269 return result - start_bit;
270found:
271 return result + ffz(tmp) - start_bit;
272}
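/*
 * Editor's example (not from the original source), assuming 32-bit
 * words (NBWORD == 32, BIT_TO_WORD_SHIFT == 5): with a single-word
 * map of 0x0000001e (bits 1..4 set),
 *
 *	xfs_contig_bits(&map, 1, 1) == 4	(run spans bits 1..4;
 *	ffz(0x1e | 0x1) == 5, minus start_bit 1)
 *	xfs_contig_bits(&map, 1, 3) == 2	(only bits 3..4 remain)
 */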
273
274/*
275 * This takes the bit number to start looking from and
276 * returns the next set bit from there. It returns -1
277 * if there are no more bits set or the start bit is
278 * beyond the end of the bitmap.
279 *
280 * Size is the number of words, not bytes, in the bitmap.
281 */
282int xfs_next_bit(uint *map, uint size, uint start_bit)
283{
284 uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
285 uint result = start_bit & ~(NBWORD - 1);
286 uint tmp;
287
288 size <<= BIT_TO_WORD_SHIFT;
289
290 if (start_bit >= size)
291 return -1;
292 size -= result;
293 start_bit &= (NBWORD - 1);
294 if (start_bit) {
295 tmp = *p++;
296 /* clear the bits below start_bit */
297 tmp &= (~0U << start_bit);
298 if (tmp != 0U)
299 goto found;
300 result += NBWORD;
301 size -= NBWORD;
302 }
303 while (size) {
304 if ((tmp = *p++) != 0U)
305 goto found;
306 result += NBWORD;
307 size -= NBWORD;
308 }
309 return -1;
310found:
311 return result + ffs(tmp) - 1;
312}
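/*
 * Editor's sketch -- a hypothetical caller walking every set bit in a
 * map with the routine above; process() is a made-up per-bit handler:
 *
 *	for (bit = xfs_next_bit(map, size, 0);
 *	     bit != -1;
 *	     bit = xfs_next_bit(map, size, bit + 1))
 *		process(bit);
 *
 * The loop terminates because a start_bit past the end of the map
 * returns -1.
 */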
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
new file mode 100644
index 000000000000..1e7f57ddf7a8
--- /dev/null
+++ b/fs/xfs/xfs_bit.h
@@ -0,0 +1,85 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BIT_H__
33#define __XFS_BIT_H__
34
35/*
36 * XFS bit manipulation routines.
37 */
38
39/*
40 * masks with n high/low bits set, 32-bit values & 64-bit values
41 */
42#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32HI)
43__uint32_t xfs_mask32hi(int n);
44#define XFS_MASK32HI(n) xfs_mask32hi(n)
45#else
46#define XFS_MASK32HI(n) ((__uint32_t)-1 << (32 - (n)))
47#endif
48#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64HI)
49__uint64_t xfs_mask64hi(int n);
50#define XFS_MASK64HI(n) xfs_mask64hi(n)
51#else
52#define XFS_MASK64HI(n) ((__uint64_t)-1 << (64 - (n)))
53#endif
54#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32LO)
55__uint32_t xfs_mask32lo(int n);
56#define XFS_MASK32LO(n) xfs_mask32lo(n)
57#else
58#define XFS_MASK32LO(n) (((__uint32_t)1 << (n)) - 1)
59#endif
60#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64LO)
61__uint64_t xfs_mask64lo(int n);
62#define XFS_MASK64LO(n) xfs_mask64lo(n)
63#else
64#define XFS_MASK64LO(n) (((__uint64_t)1 << (n)) - 1)
65#endif
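/*
 * Editor's examples (not from the original header):
 *
 *	XFS_MASK32HI(8)  == 0xff000000
 *	XFS_MASK32LO(4)  == 0x0000000f
 *	XFS_MASK64HI(16) == 0xffff000000000000ULL
 *	XFS_MASK64LO(1)  == 0x0000000000000001ULL
 *
 * Note that XFS_MASK32HI(0) and XFS_MASK32LO(32) would shift by the
 * full word width, which C leaves undefined.
 */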
66
67/* Get high bit set out of 32-bit argument, -1 if none set */
68extern int xfs_highbit32(__uint32_t v);
69
70/* Get low bit set out of 64-bit argument, -1 if none set */
71extern int xfs_lowbit64(__uint64_t v);
72
73/* Get high bit set out of 64-bit argument, -1 if none set */
74extern int xfs_highbit64(__uint64_t);
75
76/* Count set bits in map starting with start_bit */
77extern int xfs_count_bits(uint *map, uint size, uint start_bit);
78
79/* Count continuous one bits in map starting with start_bit */
80extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
81
82/* Find next set bit in map */
83extern int xfs_next_bit(uint *map, uint size, uint start_bit);
84
85#endif /* __XFS_BIT_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
new file mode 100644
index 000000000000..de3162418663
--- /dev/null
+++ b/fs/xfs/xfs_bmap.c
@@ -0,0 +1,6246 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_itable.h"
58#include "xfs_extfree_item.h"
59#include "xfs_alloc.h"
60#include "xfs_bmap.h"
61#include "xfs_rtalloc.h"
62#include "xfs_error.h"
63#include "xfs_da_btree.h"
64#include "xfs_dir_leaf.h"
65#include "xfs_bit.h"
66#include "xfs_rw.h"
67#include "xfs_quota.h"
68#include "xfs_trans_space.h"
69#include "xfs_buf_item.h"
70
71
72#ifdef DEBUG
73STATIC void
74xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
75#endif
76
77kmem_zone_t *xfs_bmap_free_item_zone;
78
79/*
80 * Prototypes for internal bmap routines.
81 */
82
83
84/*
85 * Called from xfs_bmap_add_attrfork to handle extents format files.
86 */
87STATIC int /* error */
88xfs_bmap_add_attrfork_extents(
89 xfs_trans_t *tp, /* transaction pointer */
90 xfs_inode_t *ip, /* incore inode pointer */
91 xfs_fsblock_t *firstblock, /* first block allocated */
92 xfs_bmap_free_t *flist, /* blocks to free at commit */
93 int *flags); /* inode logging flags */
94
95/*
96 * Called from xfs_bmap_add_attrfork to handle local format files.
97 */
98STATIC int /* error */
99xfs_bmap_add_attrfork_local(
100 xfs_trans_t *tp, /* transaction pointer */
101 xfs_inode_t *ip, /* incore inode pointer */
102 xfs_fsblock_t *firstblock, /* first block allocated */
103 xfs_bmap_free_t *flist, /* blocks to free at commit */
104 int *flags); /* inode logging flags */
105
106/*
107 * Called by xfs_bmapi to update extent list structure and the btree
108 * after allocating space (or doing a delayed allocation).
109 */
110STATIC int /* error */
111xfs_bmap_add_extent(
112 xfs_inode_t *ip, /* incore inode pointer */
113 xfs_extnum_t idx, /* extent number to update/insert */
114 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
115 xfs_bmbt_irec_t *new, /* new data to put in extent list */
116 xfs_fsblock_t *first, /* pointer to firstblock variable */
117 xfs_bmap_free_t *flist, /* list of extents to be freed */
118 int *logflagsp, /* inode logging flags */
119 int whichfork, /* data or attr fork */
120 int rsvd); /* OK to allocate reserved blocks */
121
122/*
123 * Called by xfs_bmap_add_extent to handle cases converting a delayed
124 * allocation to a real allocation.
125 */
126STATIC int /* error */
127xfs_bmap_add_extent_delay_real(
128 xfs_inode_t *ip, /* incore inode pointer */
129 xfs_extnum_t idx, /* extent number to update/insert */
130 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
131 xfs_bmbt_irec_t *new, /* new data to put in extent list */
132 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
133 xfs_fsblock_t *first, /* pointer to firstblock variable */
134 xfs_bmap_free_t *flist, /* list of extents to be freed */
135 int *logflagsp, /* inode logging flags */
136 int rsvd); /* OK to allocate reserved blocks */
137
138/*
139 * Called by xfs_bmap_add_extent to handle cases converting a hole
140 * to a delayed allocation.
141 */
142STATIC int /* error */
143xfs_bmap_add_extent_hole_delay(
144 xfs_inode_t *ip, /* incore inode pointer */
145 xfs_extnum_t idx, /* extent number to update/insert */
146 xfs_btree_cur_t *cur, /* if null, not a btree */
147 xfs_bmbt_irec_t *new, /* new data to put in extent list */
148 int *logflagsp,/* inode logging flags */
149 int rsvd); /* OK to allocate reserved blocks */
150
151/*
152 * Called by xfs_bmap_add_extent to handle cases converting a hole
153 * to a real allocation.
154 */
155STATIC int /* error */
156xfs_bmap_add_extent_hole_real(
157 xfs_inode_t *ip, /* incore inode pointer */
158 xfs_extnum_t idx, /* extent number to update/insert */
159 xfs_btree_cur_t *cur, /* if null, not a btree */
160 xfs_bmbt_irec_t *new, /* new data to put in extent list */
161 int *logflagsp, /* inode logging flags */
162 int whichfork); /* data or attr fork */
163
164/*
165 * Called by xfs_bmap_add_extent to handle cases converting an unwritten
166 * allocation to a real allocation or vice versa.
167 */
168STATIC int /* error */
169xfs_bmap_add_extent_unwritten_real(
170 xfs_inode_t *ip, /* incore inode pointer */
171 xfs_extnum_t idx, /* extent number to update/insert */
172 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
173 xfs_bmbt_irec_t *new, /* new data to put in extent list */
174 int *logflagsp); /* inode logging flags */
175
176/*
177 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
178 * It figures out where to ask the underlying allocator to put the new extent.
179 */
180STATIC int /* error */
181xfs_bmap_alloc(
182 xfs_bmalloca_t *ap); /* bmap alloc argument struct */
183
184/*
185 * Transform a btree format file with only one leaf node, where the
186 * extents list will fit in the inode, into an extents format file.
187 * Since the extent list is already in-core, all we have to do is
188 * give up the space for the btree root and pitch the leaf block.
189 */
190STATIC int /* error */
191xfs_bmap_btree_to_extents(
192 xfs_trans_t *tp, /* transaction pointer */
193 xfs_inode_t *ip, /* incore inode pointer */
194 xfs_btree_cur_t *cur, /* btree cursor */
195 int *logflagsp, /* inode logging flags */
196 int whichfork); /* data or attr fork */
197
198#ifdef DEBUG
199/*
200 * Check that the extents list for the inode ip is in the right order.
201 */
202STATIC void
203xfs_bmap_check_extents(
204 xfs_inode_t *ip, /* incore inode pointer */
205 int whichfork); /* data or attr fork */
206#endif
207
208/*
209 * Called by xfs_bmapi to update extent list structure and the btree
210 * after removing space (or undoing a delayed allocation).
211 */
212STATIC int /* error */
213xfs_bmap_del_extent(
214 xfs_inode_t *ip, /* incore inode pointer */
215 xfs_trans_t *tp, /* current trans pointer */
216 xfs_extnum_t idx, /* extent number to update/insert */
217 xfs_bmap_free_t *flist, /* list of extents to be freed */
218 xfs_btree_cur_t *cur, /* if null, not a btree */
219 xfs_bmbt_irec_t *new, /* new data to put in extent list */
220 int *logflagsp,/* inode logging flags */
221 int whichfork, /* data or attr fork */
222 int rsvd); /* OK to allocate reserved blocks */
223
224/*
225 * Remove the entry "free" from the free item list. Prev points to the
226 * previous entry, unless "free" is the head of the list.
227 */
228STATIC void
229xfs_bmap_del_free(
230 xfs_bmap_free_t *flist, /* free item list header */
231 xfs_bmap_free_item_t *prev, /* previous item on list, if any */
232 xfs_bmap_free_item_t *free); /* list item to be freed */
233
234/*
235 * Remove count entries from the extents array for inode "ip", starting
236 * at index "idx". Copies the remaining items down over the deleted ones,
237 * and gives back the excess memory.
238 */
239STATIC void
240xfs_bmap_delete_exlist(
241 xfs_inode_t *ip, /* incore inode pointer */
242 xfs_extnum_t idx, /* starting delete index */
243 xfs_extnum_t count, /* count of items to delete */
244 int whichfork); /* data or attr fork */
245
246/*
247 * Convert an extents-format file into a btree-format file.
248 * The new file will have a root block (in the inode) and a single child block.
249 */
250STATIC int /* error */
251xfs_bmap_extents_to_btree(
252 xfs_trans_t *tp, /* transaction pointer */
253 xfs_inode_t *ip, /* incore inode pointer */
254 xfs_fsblock_t *firstblock, /* first-block-allocated */
255 xfs_bmap_free_t *flist, /* blocks freed in xaction */
256 xfs_btree_cur_t **curp, /* cursor returned to caller */
257 int wasdel, /* converting a delayed alloc */
258 int *logflagsp, /* inode logging flags */
259 int whichfork); /* data or attr fork */
260
261/*
262 * Insert new item(s) in the extent list for inode "ip".
263 * Count new items are inserted at offset idx.
264 */
265STATIC void
266xfs_bmap_insert_exlist(
267 xfs_inode_t *ip, /* incore inode pointer */
268 xfs_extnum_t idx, /* starting index of new items */
269 xfs_extnum_t count, /* number of inserted items */
270 xfs_bmbt_irec_t *new, /* items to insert */
271 int whichfork); /* data or attr fork */
272
273/*
274 * Convert a local file to an extents file.
275 * This code is sort of bogus, since the file data needs to get
276 * logged so it won't be lost. The bmap-level manipulations are ok, though.
277 */
278STATIC int /* error */
279xfs_bmap_local_to_extents(
280 xfs_trans_t *tp, /* transaction pointer */
281 xfs_inode_t *ip, /* incore inode pointer */
282 xfs_fsblock_t *firstblock, /* first block allocated in xaction */
283 xfs_extlen_t total, /* total blocks needed by transaction */
284 int *logflagsp, /* inode logging flags */
285 int whichfork); /* data or attr fork */
286
287/*
288 * Search the extents list for the inode, for the extent containing bno.
289 * If bno lies in a hole, point to the next entry. If bno lies past eof,
290 * *eofp will be set, and *prevp will contain the last entry (null if none).
291 * Else, *lastxp will be set to the index of the found
292 * entry; *gotp will contain the entry.
293 */
294STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */
295xfs_bmap_search_extents(
296 xfs_inode_t *ip, /* incore inode pointer */
297 xfs_fileoff_t bno, /* block number searched for */
298 int whichfork, /* data or attr fork */
299 int *eofp, /* out: end of file found */
300 xfs_extnum_t *lastxp, /* out: last extent index */
301 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
302 xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
303
304#ifdef XFS_BMAP_TRACE
305/*
306 * Add a bmap trace buffer entry. Base routine for the others.
307 */
308STATIC void
309xfs_bmap_trace_addentry(
310 int opcode, /* operation */
311 char *fname, /* function name */
312 char *desc, /* operation description */
313 xfs_inode_t *ip, /* incore inode pointer */
314 xfs_extnum_t idx, /* index of entry(ies) */
315 xfs_extnum_t cnt, /* count of entries, 1 or 2 */
316 xfs_bmbt_rec_t *r1, /* first record */
317 xfs_bmbt_rec_t *r2, /* second record or null */
318 int whichfork); /* data or attr fork */
319
320/*
321 * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
322 */
323STATIC void
324xfs_bmap_trace_delete(
325 char *fname, /* function name */
326 char *desc, /* operation description */
327 xfs_inode_t *ip, /* incore inode pointer */
328 xfs_extnum_t idx, /* index of entry(entries) deleted */
329 xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
330 int whichfork); /* data or attr fork */
331
332/*
333 * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
334 * reading in the extents list from the disk (in the btree).
335 */
336STATIC void
337xfs_bmap_trace_insert(
338 char *fname, /* function name */
339 char *desc, /* operation description */
340 xfs_inode_t *ip, /* incore inode pointer */
341 xfs_extnum_t idx, /* index of entry(entries) inserted */
342 xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
343 xfs_bmbt_irec_t *r1, /* inserted record 1 */
344 xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
345 int whichfork); /* data or attr fork */
346
347/*
348 * Add bmap trace entry after updating an extent list entry in place.
349 */
350STATIC void
351xfs_bmap_trace_post_update(
352 char *fname, /* function name */
353 char *desc, /* operation description */
354 xfs_inode_t *ip, /* incore inode pointer */
355 xfs_extnum_t idx, /* index of entry updated */
356 int whichfork); /* data or attr fork */
357
358/*
359 * Add bmap trace entry prior to updating an extent list entry in place.
360 */
361STATIC void
362xfs_bmap_trace_pre_update(
363 char *fname, /* function name */
364 char *desc, /* operation description */
365 xfs_inode_t *ip, /* incore inode pointer */
366 xfs_extnum_t idx, /* index of entry to be updated */
367 int whichfork); /* data or attr fork */
368
369#else
370#define xfs_bmap_trace_delete(f,d,ip,i,c,w)
371#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w)
372#define xfs_bmap_trace_post_update(f,d,ip,i,w)
373#define xfs_bmap_trace_pre_update(f,d,ip,i,w)
374#endif /* XFS_BMAP_TRACE */
375
376/*
377 * Compute the worst-case number of indirect blocks that will be used
378 * for ip's delayed extent of length "len".
379 */
380STATIC xfs_filblks_t
381xfs_bmap_worst_indlen(
382 xfs_inode_t *ip, /* incore inode pointer */
383 xfs_filblks_t len); /* delayed extent length */
384
385#ifdef DEBUG
386/*
387 * Perform various validation checks on the values being returned
388 * from xfs_bmapi().
389 */
390STATIC void
391xfs_bmap_validate_ret(
392 xfs_fileoff_t bno,
393 xfs_filblks_t len,
394 int flags,
395 xfs_bmbt_irec_t *mval,
396 int nmap,
397 int ret_nmap);
398#else
399#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
400#endif /* DEBUG */
401
402#if defined(XFS_RW_TRACE)
403STATIC void
404xfs_bunmap_trace(
405 xfs_inode_t *ip,
406 xfs_fileoff_t bno,
407 xfs_filblks_t len,
408 int flags,
409 inst_t *ra);
410#else
411#define xfs_bunmap_trace(ip, bno, len, flags, ra)
412#endif /* XFS_RW_TRACE */
413
414STATIC int
415xfs_bmap_count_tree(
416 xfs_mount_t *mp,
417 xfs_trans_t *tp,
418 xfs_fsblock_t blockno,
419 int levelin,
420 int *count);
421
422STATIC int
423xfs_bmap_count_leaves(
424 xfs_bmbt_rec_t *frp,
425 int numrecs,
426 int *count);
427
428/*
429 * Bmap internal routines.
430 */
431
432/*
433 * Called from xfs_bmap_add_attrfork to handle btree format files.
434 */
435STATIC int /* error */
436xfs_bmap_add_attrfork_btree(
437 xfs_trans_t *tp, /* transaction pointer */
438 xfs_inode_t *ip, /* incore inode pointer */
439 xfs_fsblock_t *firstblock, /* first block allocated */
440 xfs_bmap_free_t *flist, /* blocks to free at commit */
441 int *flags) /* inode logging flags */
442{
443 xfs_btree_cur_t *cur; /* btree cursor */
444 int error; /* error return value */
445 xfs_mount_t *mp; /* file system mount struct */
446 int stat; /* newroot status */
447
448 mp = ip->i_mount;
449 if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
450 *flags |= XFS_ILOG_DBROOT;
451 else {
452 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
453 XFS_DATA_FORK);
454 cur->bc_private.b.flist = flist;
455 cur->bc_private.b.firstblock = *firstblock;
456 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
457 goto error0;
458 ASSERT(stat == 1); /* must be at least one entry */
459 if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
460 goto error0;
461 if (stat == 0) {
462 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
463 return XFS_ERROR(ENOSPC);
464 }
465 *firstblock = cur->bc_private.b.firstblock;
466 cur->bc_private.b.allocated = 0;
467 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
468 }
469 return 0;
470error0:
471 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
472 return error;
473}
474
475/*
476 * Called from xfs_bmap_add_attrfork to handle extents format files.
477 */
478STATIC int /* error */
479xfs_bmap_add_attrfork_extents(
480 xfs_trans_t *tp, /* transaction pointer */
481 xfs_inode_t *ip, /* incore inode pointer */
482 xfs_fsblock_t *firstblock, /* first block allocated */
483 xfs_bmap_free_t *flist, /* blocks to free at commit */
484 int *flags) /* inode logging flags */
485{
486 xfs_btree_cur_t *cur; /* bmap btree cursor */
487 int error; /* error return value */
488
489 if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
490 return 0;
491 cur = NULL;
492 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
493 flags, XFS_DATA_FORK);
494 if (cur) {
495 cur->bc_private.b.allocated = 0;
496 xfs_btree_del_cursor(cur,
497 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
498 }
499 return error;
500}
501
502/*
503 * Called from xfs_bmap_add_attrfork to handle local format files.
504 */
505STATIC int /* error */
506xfs_bmap_add_attrfork_local(
507 xfs_trans_t *tp, /* transaction pointer */
508 xfs_inode_t *ip, /* incore inode pointer */
509 xfs_fsblock_t *firstblock, /* first block allocated */
510 xfs_bmap_free_t *flist, /* blocks to free at commit */
511 int *flags) /* inode logging flags */
512{
513 xfs_da_args_t dargs; /* args for dir/attr code */
514 int error; /* error return value */
515 xfs_mount_t *mp; /* mount structure pointer */
516
517 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
518 return 0;
519 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
520 mp = ip->i_mount;
521 memset(&dargs, 0, sizeof(dargs));
522 dargs.dp = ip;
523 dargs.firstblock = firstblock;
524 dargs.flist = flist;
525 dargs.total = mp->m_dirblkfsbs;
526 dargs.whichfork = XFS_DATA_FORK;
527 dargs.trans = tp;
528 error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
529 } else
530 error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
531 XFS_DATA_FORK);
532 return error;
533}
534
535/*
536 * Called by xfs_bmapi to update extent list structure and the btree
537 * after allocating space (or doing a delayed allocation).
538 */
539STATIC int /* error */
540xfs_bmap_add_extent(
541 xfs_inode_t *ip, /* incore inode pointer */
542 xfs_extnum_t idx, /* extent number to update/insert */
543 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
544 xfs_bmbt_irec_t *new, /* new data to put in extent list */
545 xfs_fsblock_t *first, /* pointer to firstblock variable */
546 xfs_bmap_free_t *flist, /* list of extents to be freed */
547 int *logflagsp, /* inode logging flags */
548 int whichfork, /* data or attr fork */
549 int rsvd) /* OK to use reserved data blocks */
550{
551 xfs_btree_cur_t *cur; /* btree cursor or null */
552 xfs_filblks_t da_new; /* new count del alloc blocks used */
553 xfs_filblks_t da_old; /* old count del alloc blocks used */
554 int error; /* error return value */
555#ifdef XFS_BMAP_TRACE
556 static char fname[] = "xfs_bmap_add_extent";
557#endif
558 xfs_ifork_t *ifp; /* inode fork ptr */
559 int logflags; /* returned value */
560 xfs_extnum_t nextents; /* number of extents in file now */
561
562 XFS_STATS_INC(xs_add_exlist);
563 cur = *curp;
564 ifp = XFS_IFORK_PTR(ip, whichfork);
565 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
566 ASSERT(idx <= nextents);
567 da_old = da_new = 0;
568 error = 0;
569 /*
570 * This is the first extent added to a new/empty file.
571 * Special case this one, so other routines get to assume there are
572 * already extents in the list.
573 */
574 if (nextents == 0) {
575 xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new,
576 NULL, whichfork);
577 xfs_bmap_insert_exlist(ip, 0, 1, new, whichfork);
578 ASSERT(cur == NULL);
579 ifp->if_lastex = 0;
580 if (!ISNULLSTARTBLOCK(new->br_startblock)) {
581 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
582 logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
583 } else
584 logflags = 0;
585 }
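	/*
	 * Editor's note (not in the original source): a delayed
	 * allocation is recognized by its startblock field, which holds
	 * a NULLSTARTBLOCK() encoding of the indirect blocks reserved
	 * for it (recovered via STARTBLOCKVAL()) rather than a real
	 * disk block number.
	 */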
586 /*
587 * Any kind of new delayed allocation goes here.
588 */
589 else if (ISNULLSTARTBLOCK(new->br_startblock)) {
590 if (cur)
591 ASSERT((cur->bc_private.b.flags &
592 XFS_BTCUR_BPRV_WASDEL) == 0);
593 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
594 &logflags, rsvd)))
595 goto done;
596 }
597 /*
598 * Real allocation off the end of the file.
599 */
600 else if (idx == nextents) {
601 if (cur)
602 ASSERT((cur->bc_private.b.flags &
603 XFS_BTCUR_BPRV_WASDEL) == 0);
604 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
605 &logflags, whichfork)))
606 goto done;
607 } else {
608 xfs_bmbt_irec_t prev; /* old extent at offset idx */
609
610 /*
611 * Get the record referred to by idx.
612 */
613 xfs_bmbt_get_all(&ifp->if_u1.if_extents[idx], &prev);
614 /*
615 * If it's a real allocation record, and the new allocation ends
616 * after the start of the referred to record, then we're filling
617 * in a delayed or unwritten allocation with a real one, or
618 * converting real back to unwritten.
619 */
620 if (!ISNULLSTARTBLOCK(new->br_startblock) &&
621 new->br_startoff + new->br_blockcount > prev.br_startoff) {
622 if (prev.br_state != XFS_EXT_UNWRITTEN &&
623 ISNULLSTARTBLOCK(prev.br_startblock)) {
624 da_old = STARTBLOCKVAL(prev.br_startblock);
625 if (cur)
626 ASSERT(cur->bc_private.b.flags &
627 XFS_BTCUR_BPRV_WASDEL);
628 if ((error = xfs_bmap_add_extent_delay_real(ip,
629 idx, &cur, new, &da_new, first, flist,
630 &logflags, rsvd)))
631 goto done;
632 } else if (new->br_state == XFS_EXT_NORM) {
633 ASSERT(new->br_state == XFS_EXT_NORM);
634 if ((error = xfs_bmap_add_extent_unwritten_real(
635 ip, idx, &cur, new, &logflags)))
636 goto done;
637 } else {
638 ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
639 if ((error = xfs_bmap_add_extent_unwritten_real(
640 ip, idx, &cur, new, &logflags)))
641 goto done;
642 }
643 ASSERT(*curp == cur || *curp == NULL);
644 }
645 /*
646 * Otherwise we're filling in a hole with an allocation.
647 */
648 else {
649 if (cur)
650 ASSERT((cur->bc_private.b.flags &
651 XFS_BTCUR_BPRV_WASDEL) == 0);
652 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
653 new, &logflags, whichfork)))
654 goto done;
655 }
656 }
657
658 ASSERT(*curp == cur || *curp == NULL);
659 /*
660 * Convert to a btree if necessary.
661 */
662 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
663 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
664 int tmp_logflags; /* partial log flag return val */
665
666 ASSERT(cur == NULL);
667 error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first,
668 flist, &cur, da_old > 0, &tmp_logflags, whichfork);
669 logflags |= tmp_logflags;
670 if (error)
671 goto done;
672 }
673 /*
674 * Adjust for changes in reserved delayed indirect blocks.
675 * Nothing to do for disk quotas here.
676 */
677 if (da_old || da_new) {
678 xfs_filblks_t nblks;
679
680 nblks = da_new;
681 if (cur)
682 nblks += cur->bc_private.b.allocated;
683 ASSERT(nblks <= da_old);
684 if (nblks < da_old)
685 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
686 (int)(da_old - nblks), rsvd);
687 }
688 /*
689 * Clear out the allocated field, done with it now in any case.
690 */
691 if (cur) {
692 cur->bc_private.b.allocated = 0;
693 *curp = cur;
694 }
695done:
696#ifdef DEBUG
697 if (!error)
698 xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
699#endif
700 *logflagsp = logflags;
701 return error;
702}
703
704/*
705 * Called by xfs_bmap_add_extent to handle cases converting a delayed
706 * allocation to a real allocation.
707 */
708STATIC int /* error */
709xfs_bmap_add_extent_delay_real(
710 xfs_inode_t *ip, /* incore inode pointer */
711 xfs_extnum_t idx, /* extent number to update/insert */
712 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
713 xfs_bmbt_irec_t *new, /* new data to put in extent list */
714 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
715 xfs_fsblock_t *first, /* pointer to firstblock variable */
716 xfs_bmap_free_t *flist, /* list of extents to be freed */
717 int *logflagsp, /* inode logging flags */
718 int rsvd) /* OK to use reserved data block allocation */
719{
720 xfs_bmbt_rec_t *base; /* base of extent entry list */
721 xfs_btree_cur_t *cur; /* btree cursor */
722 int diff; /* temp value */
723 xfs_bmbt_rec_t *ep; /* extent entry for idx */
724 int error; /* error return value */
725#ifdef XFS_BMAP_TRACE
726 static char fname[] = "xfs_bmap_add_extent_delay_real";
727#endif
728 int i; /* temp state */
729 xfs_fileoff_t new_endoff; /* end offset of new entry */
730 xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
731 /* left is 0, right is 1, prev is 2 */
732 int rval=0; /* return value (logging flags) */
733 int state = 0;/* state bits, accessed thru macros */
734 xfs_filblks_t temp; /* value for dnew calculations */
735 xfs_filblks_t temp2; /* value for dnew calculations */
736 int tmp_rval; /* partial logging flags */
737 enum { /* bit number definitions for state */
738 LEFT_CONTIG, RIGHT_CONTIG,
739 LEFT_FILLING, RIGHT_FILLING,
740 LEFT_DELAY, RIGHT_DELAY,
741 LEFT_VALID, RIGHT_VALID
742 };
743
744#define LEFT r[0]
745#define RIGHT r[1]
746#define PREV r[2]
747#define MASK(b) (1 << (b))
748#define MASK2(a,b) (MASK(a) | MASK(b))
749#define MASK3(a,b,c) (MASK2(a,b) | MASK(c))
750#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d))
751#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
752#define STATE_TEST(b) (state & MASK(b))
753#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
754 ((state &= ~MASK(b)), 0))
755#define SWITCH_STATE \
756 (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
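	/*
	 * Editor's note (not in the original source): SWITCH_STATE packs
	 * the four FILLING/CONTIG bits into the value the switch below
	 * dispatches on.  For example, a real extent that exactly covers
	 * the old delayed extent and merges with both neighbors selects
	 * the MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG,
	 * RIGHT_CONTIG) case, while one strictly inside the delayed
	 * extent sets none of the four bits and lands in case 0.
	 */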
757
758 /*
759 * Set up a bunch of variables to make the tests simpler.
760 */
761 cur = *curp;
762 base = ip->i_df.if_u1.if_extents;
763 ep = &base[idx];
764 xfs_bmbt_get_all(ep, &PREV);
765 new_endoff = new->br_startoff + new->br_blockcount;
766 ASSERT(PREV.br_startoff <= new->br_startoff);
767 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
768 /*
769 * Set flags determining what part of the previous delayed allocation
770 * extent is being replaced by a real allocation.
771 */
772 STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
773 STATE_SET(RIGHT_FILLING,
774 PREV.br_startoff + PREV.br_blockcount == new_endoff);
775 /*
776 * Check and set flags if this segment has a left neighbor.
777 * Don't set contiguous if the combined extent would be too large.
778 */
779 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
780 xfs_bmbt_get_all(ep - 1, &LEFT);
781 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
782 }
783 STATE_SET(LEFT_CONTIG,
784 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
785 LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
786 LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
787 LEFT.br_state == new->br_state &&
788 LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
789 /*
790 * Check and set flags if this segment has a right neighbor.
791 * Don't set contiguous if the combined extent would be too large.
792 * Also check for all-three-contiguous being too large.
793 */
794 if (STATE_SET_TEST(RIGHT_VALID,
795 idx <
796 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
797 xfs_bmbt_get_all(ep + 1, &RIGHT);
798 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
799 }
800 STATE_SET(RIGHT_CONTIG,
801 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
802 new_endoff == RIGHT.br_startoff &&
803 new->br_startblock + new->br_blockcount ==
804 RIGHT.br_startblock &&
805 new->br_state == RIGHT.br_state &&
806 new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
807 ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
808 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
809 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
810 <= MAXEXTLEN));
811 error = 0;
812 /*
813 * Switch out based on the FILLING and CONTIG state bits.
814 */
815 switch (SWITCH_STATE) {
816
817 case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
818 /*
819 * Filling in all of a previously delayed allocation extent.
820 * The left and right neighbors are both contiguous with new.
821 */
822 xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
823 XFS_DATA_FORK);
824 xfs_bmbt_set_blockcount(ep - 1,
825 LEFT.br_blockcount + PREV.br_blockcount +
826 RIGHT.br_blockcount);
827 xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
828 XFS_DATA_FORK);
829 xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
830 XFS_DATA_FORK);
831 xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
832 ip->i_df.if_lastex = idx - 1;
833 ip->i_d.di_nextents--;
834 if (cur == NULL)
835 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
836 else {
837 rval = XFS_ILOG_CORE;
838 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
839 RIGHT.br_startblock,
840 RIGHT.br_blockcount, &i)))
841 goto done;
842 ASSERT(i == 1);
843 if ((error = xfs_bmbt_delete(cur, &i)))
844 goto done;
845 ASSERT(i == 1);
846 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
847 goto done;
848 ASSERT(i == 1);
849 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
850 LEFT.br_startblock,
851 LEFT.br_blockcount +
852 PREV.br_blockcount +
853 RIGHT.br_blockcount, LEFT.br_state)))
854 goto done;
855 }
856 *dnew = 0;
857 break;
858
859 case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
860 /*
861 * Filling in all of a previously delayed allocation extent.
862 * The left neighbor is contiguous, the right is not.
863 */
864 xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
865 XFS_DATA_FORK);
866 xfs_bmbt_set_blockcount(ep - 1,
867 LEFT.br_blockcount + PREV.br_blockcount);
868 xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
869 XFS_DATA_FORK);
870 ip->i_df.if_lastex = idx - 1;
871 xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
872 XFS_DATA_FORK);
873 xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
874 if (cur == NULL)
875 rval = XFS_ILOG_DEXT;
876 else {
877 rval = 0;
878 if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
879 LEFT.br_startblock, LEFT.br_blockcount,
880 &i)))
881 goto done;
882 ASSERT(i == 1);
883 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
884 LEFT.br_startblock,
885 LEFT.br_blockcount +
886 PREV.br_blockcount, LEFT.br_state)))
887 goto done;
888 }
889 *dnew = 0;
890 break;
891
892 case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
893 /*
894 * Filling in all of a previously delayed allocation extent.
895 * The right neighbor is contiguous, the left is not.
896 */
897 xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
898 XFS_DATA_FORK);
899 xfs_bmbt_set_startblock(ep, new->br_startblock);
900 xfs_bmbt_set_blockcount(ep,
901 PREV.br_blockcount + RIGHT.br_blockcount);
902 xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
903 XFS_DATA_FORK);
904 ip->i_df.if_lastex = idx;
905 xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
906 XFS_DATA_FORK);
907 xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
908 if (cur == NULL)
909 rval = XFS_ILOG_DEXT;
910 else {
911 rval = 0;
912 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
913 RIGHT.br_startblock,
914 RIGHT.br_blockcount, &i)))
915 goto done;
916 ASSERT(i == 1);
917 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
918 new->br_startblock,
919 PREV.br_blockcount +
920 RIGHT.br_blockcount, PREV.br_state)))
921 goto done;
922 }
923 *dnew = 0;
924 break;
925
926 case MASK2(LEFT_FILLING, RIGHT_FILLING):
927 /*
928 * Filling in all of a previously delayed allocation extent.
929 * Neither the left nor right neighbors are contiguous with
930 * the new one.
931 */
932 xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
933 XFS_DATA_FORK);
934 xfs_bmbt_set_startblock(ep, new->br_startblock);
935 xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
936 XFS_DATA_FORK);
937 ip->i_df.if_lastex = idx;
938 ip->i_d.di_nextents++;
939 if (cur == NULL)
940 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
941 else {
942 rval = XFS_ILOG_CORE;
943 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
944 new->br_startblock, new->br_blockcount,
945 &i)))
946 goto done;
947 ASSERT(i == 0);
948 cur->bc_rec.b.br_state = XFS_EXT_NORM;
949 if ((error = xfs_bmbt_insert(cur, &i)))
950 goto done;
951 ASSERT(i == 1);
952 }
953 *dnew = 0;
954 break;
955
956 case MASK2(LEFT_FILLING, LEFT_CONTIG):
957 /*
958 * Filling in the first part of a previous delayed allocation.
959 * The left neighbor is contiguous.
960 */
961 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
962 XFS_DATA_FORK);
963 xfs_bmbt_set_blockcount(ep - 1,
964 LEFT.br_blockcount + new->br_blockcount);
965 xfs_bmbt_set_startoff(ep,
966 PREV.br_startoff + new->br_blockcount);
967 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
968 XFS_DATA_FORK);
969 temp = PREV.br_blockcount - new->br_blockcount;
970 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
971 XFS_DATA_FORK);
972 xfs_bmbt_set_blockcount(ep, temp);
973 ip->i_df.if_lastex = idx - 1;
974 if (cur == NULL)
975 rval = XFS_ILOG_DEXT;
976 else {
977 rval = 0;
978 if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
979 LEFT.br_startblock, LEFT.br_blockcount,
980 &i)))
981 goto done;
982 ASSERT(i == 1);
983 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
984 LEFT.br_startblock,
985 LEFT.br_blockcount +
986 new->br_blockcount,
987 LEFT.br_state)))
988 goto done;
989 }
990 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
991 STARTBLOCKVAL(PREV.br_startblock));
992 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
993 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
994 XFS_DATA_FORK);
995 *dnew = temp;
996 break;
997
998 case MASK(LEFT_FILLING):
999 /*
1000 * Filling in the first part of a previous delayed allocation.
1001 * The left neighbor is not contiguous.
1002 */
1003 xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
1004 xfs_bmbt_set_startoff(ep, new_endoff);
1005 temp = PREV.br_blockcount - new->br_blockcount;
1006 xfs_bmbt_set_blockcount(ep, temp);
1007 xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
1008 XFS_DATA_FORK);
1009 xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
1010 ip->i_df.if_lastex = idx;
1011 ip->i_d.di_nextents++;
1012 if (cur == NULL)
1013 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1014 else {
1015 rval = XFS_ILOG_CORE;
1016 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1017 new->br_startblock, new->br_blockcount,
1018 &i)))
1019 goto done;
1020 ASSERT(i == 0);
1021 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1022 if ((error = xfs_bmbt_insert(cur, &i)))
1023 goto done;
1024 ASSERT(i == 1);
1025 }
1026 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1027 ip->i_d.di_nextents > ip->i_df.if_ext_max) {
1028 error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
1029 first, flist, &cur, 1, &tmp_rval,
1030 XFS_DATA_FORK);
1031 rval |= tmp_rval;
1032 if (error)
1033 goto done;
1034 }
1035 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1036 STARTBLOCKVAL(PREV.br_startblock) -
1037 (cur ? cur->bc_private.b.allocated : 0));
1038 base = ip->i_df.if_u1.if_extents;
1039 ep = &base[idx + 1];
1040 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1041 xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
1042 XFS_DATA_FORK);
1043 *dnew = temp;
1044 break;
1045
1046 case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
1047 /*
1048 * Filling in the last part of a previous delayed allocation.
1049 * The right neighbor is contiguous with the new allocation.
1050 */
1051 temp = PREV.br_blockcount - new->br_blockcount;
1052 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
1053 XFS_DATA_FORK);
1054 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
1055 XFS_DATA_FORK);
1056 xfs_bmbt_set_blockcount(ep, temp);
1057 xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
1058 new->br_blockcount + RIGHT.br_blockcount,
1059 RIGHT.br_state);
1060 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
1061 XFS_DATA_FORK);
1062 ip->i_df.if_lastex = idx + 1;
1063 if (cur == NULL)
1064 rval = XFS_ILOG_DEXT;
1065 else {
1066 rval = 0;
1067 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
1068 RIGHT.br_startblock,
1069 RIGHT.br_blockcount, &i)))
1070 goto done;
1071 ASSERT(i == 1);
1072 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1073 new->br_startblock,
1074 new->br_blockcount +
1075 RIGHT.br_blockcount,
1076 RIGHT.br_state)))
1077 goto done;
1078 }
1079 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1080 STARTBLOCKVAL(PREV.br_startblock));
1081 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1082 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
1083 XFS_DATA_FORK);
1084 *dnew = temp;
1085 break;
1086
1087 case MASK(RIGHT_FILLING):
1088 /*
1089 * Filling in the last part of a previous delayed allocation.
1090 * The right neighbor is not contiguous.
1091 */
1092 temp = PREV.br_blockcount - new->br_blockcount;
1093 xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1094 xfs_bmbt_set_blockcount(ep, temp);
1095 xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
1096 new, NULL, XFS_DATA_FORK);
1097 xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
1098 ip->i_df.if_lastex = idx + 1;
1099 ip->i_d.di_nextents++;
1100 if (cur == NULL)
1101 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1102 else {
1103 rval = XFS_ILOG_CORE;
1104 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1105 new->br_startblock, new->br_blockcount,
1106 &i)))
1107 goto done;
1108 ASSERT(i == 0);
1109 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1110 if ((error = xfs_bmbt_insert(cur, &i)))
1111 goto done;
1112 ASSERT(i == 1);
1113 }
1114 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1115 ip->i_d.di_nextents > ip->i_df.if_ext_max) {
1116 error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
1117 first, flist, &cur, 1, &tmp_rval,
1118 XFS_DATA_FORK);
1119 rval |= tmp_rval;
1120 if (error)
1121 goto done;
1122 }
1123 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1124 STARTBLOCKVAL(PREV.br_startblock) -
1125 (cur ? cur->bc_private.b.allocated : 0));
1126 base = ip->i_df.if_u1.if_extents;
1127 ep = &base[idx];
1128 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1129 xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1130 *dnew = temp;
1131 break;
1132
1133 case 0:
1134 /*
1135 * Filling in the middle part of a previous delayed allocation.
1136 * Contiguity is impossible here.
1137 * This case is avoided almost all the time.
1138 */
1139 temp = new->br_startoff - PREV.br_startoff;
1140 xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
1141 xfs_bmbt_set_blockcount(ep, temp);
1142 r[0] = *new;
1143 r[1].br_startoff = new_endoff;
1144 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1145 r[1].br_blockcount = temp2;
1146 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
1147 XFS_DATA_FORK);
1148 xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
1149 ip->i_df.if_lastex = idx + 1;
1150 ip->i_d.di_nextents++;
1151 if (cur == NULL)
1152 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1153 else {
1154 rval = XFS_ILOG_CORE;
1155 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1156 new->br_startblock, new->br_blockcount,
1157 &i)))
1158 goto done;
1159 ASSERT(i == 0);
1160 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1161 if ((error = xfs_bmbt_insert(cur, &i)))
1162 goto done;
1163 ASSERT(i == 1);
1164 }
1165 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1166 ip->i_d.di_nextents > ip->i_df.if_ext_max) {
1167 error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
1168 first, flist, &cur, 1, &tmp_rval,
1169 XFS_DATA_FORK);
1170 rval |= tmp_rval;
1171 if (error)
1172 goto done;
1173 }
1174 temp = xfs_bmap_worst_indlen(ip, temp);
1175 temp2 = xfs_bmap_worst_indlen(ip, temp2);
1176 diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
1177 (cur ? cur->bc_private.b.allocated : 0));
1178 if (diff > 0 &&
1179 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) {
1180 /*
1181 * Ick gross gag me with a spoon.
1182 */
1183 ASSERT(0); /* want to see if this ever happens! */
1184 while (diff > 0) {
1185 if (temp) {
1186 temp--;
1187 diff--;
1188 if (!diff ||
1189 !xfs_mod_incore_sb(ip->i_mount,
1190 XFS_SBS_FDBLOCKS, -diff, rsvd))
1191 break;
1192 }
1193 if (temp2) {
1194 temp2--;
1195 diff--;
1196 if (!diff ||
1197 !xfs_mod_incore_sb(ip->i_mount,
1198 XFS_SBS_FDBLOCKS, -diff, rsvd))
1199 break;
1200 }
1201 }
1202 }
1203 base = ip->i_df.if_u1.if_extents;
1204 ep = &base[idx];
1205 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1206 xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
1207 xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2,
1208 XFS_DATA_FORK);
1209 xfs_bmbt_set_startblock(ep + 2, NULLSTARTBLOCK((int)temp2));
1210 xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
1211 XFS_DATA_FORK);
1212 *dnew = temp + temp2;
1213 break;
1214
1215 case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1216 case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1217 case MASK2(LEFT_FILLING, RIGHT_CONTIG):
1218 case MASK2(RIGHT_FILLING, LEFT_CONTIG):
1219 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
1220 case MASK(LEFT_CONTIG):
1221 case MASK(RIGHT_CONTIG):
1222 /*
1223 * These cases are all impossible.
1224 */
1225 ASSERT(0);
1226 }
1227 *curp = cur;
1228done:
1229 *logflagsp = rval;
1230 return error;
1231#undef LEFT
1232#undef RIGHT
1233#undef PREV
1234#undef MASK
1235#undef MASK2
1236#undef MASK3
1237#undef MASK4
1238#undef STATE_SET
1239#undef STATE_TEST
1240#undef STATE_SET_TEST
1241#undef SWITCH_STATE
1242}
1243
1244/*
1245 * Called by xfs_bmap_add_extent to handle cases converting an unwritten
1246 * allocation to a real allocation or vice versa.
1247 */
1248STATIC int /* error */
1249xfs_bmap_add_extent_unwritten_real(
1250 xfs_inode_t *ip, /* incore inode pointer */
1251 xfs_extnum_t idx, /* extent number to update/insert */
1252 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1253 xfs_bmbt_irec_t *new, /* new data to put in extent list */
1254 int *logflagsp) /* inode logging flags */
1255{
1256 xfs_bmbt_rec_t *base; /* base of extent entry list */
1257 xfs_btree_cur_t *cur; /* btree cursor */
1258 xfs_bmbt_rec_t *ep; /* extent entry for idx */
1259 int error; /* error return value */
1260#ifdef XFS_BMAP_TRACE
1261 static char fname[] = "xfs_bmap_add_extent_unwritten_real";
1262#endif
1263 int i; /* temp state */
1264 xfs_fileoff_t new_endoff; /* end offset of new entry */
1265 xfs_exntst_t newext; /* new extent state */
1266 xfs_exntst_t oldext; /* old extent state */
1267 xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
1268 /* left is 0, right is 1, prev is 2 */
1269 int rval=0; /* return value (logging flags) */
1270 int state = 0;/* state bits, accessed thru macros */
1271 enum { /* bit number definitions for state */
1272 LEFT_CONTIG, RIGHT_CONTIG,
1273 LEFT_FILLING, RIGHT_FILLING,
1274 LEFT_DELAY, RIGHT_DELAY,
1275 LEFT_VALID, RIGHT_VALID
1276 };
1277
1278#define LEFT r[0]
1279#define RIGHT r[1]
1280#define PREV r[2]
1281#define MASK(b) (1 << (b))
1282#define MASK2(a,b) (MASK(a) | MASK(b))
1283#define MASK3(a,b,c) (MASK2(a,b) | MASK(c))
1284#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d))
1285#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
1286#define STATE_TEST(b) (state & MASK(b))
1287#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
1288 ((state &= ~MASK(b)), 0))
1289#define SWITCH_STATE \
1290 (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
1291
1292 /*
1293 * Set up a bunch of variables to make the tests simpler.
1294 */
1295 error = 0;
1296 cur = *curp;
1297 base = ip->i_df.if_u1.if_extents;
1298 ep = &base[idx];
1299 xfs_bmbt_get_all(ep, &PREV);
1300 newext = new->br_state;
1301 oldext = (newext == XFS_EXT_UNWRITTEN) ?
1302 XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
1303 ASSERT(PREV.br_state == oldext);
1304 new_endoff = new->br_startoff + new->br_blockcount;
1305 ASSERT(PREV.br_startoff <= new->br_startoff);
1306 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
1307 /*
1308 * Set flags determining what part of the previous oldext allocation
1309 * extent is being replaced by a newext allocation.
1310 */
1311 STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
1312 STATE_SET(RIGHT_FILLING,
1313 PREV.br_startoff + PREV.br_blockcount == new_endoff);
1314 /*
1315 * Check and set flags if this segment has a left neighbor.
1316 * Don't set contiguous if the combined extent would be too large.
1317 */
1318 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1319 xfs_bmbt_get_all(ep - 1, &LEFT);
1320 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
1321 }
1322 STATE_SET(LEFT_CONTIG,
1323 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
1324 LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
1325 LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
1326 LEFT.br_state == newext &&
1327 LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
1328 /*
1329 * Check and set flags if this segment has a right neighbor.
1330 * Don't set contiguous if the combined extent would be too large.
1331 * Also check for all-three-contiguous being too large.
1332 */
1333 if (STATE_SET_TEST(RIGHT_VALID,
1334 idx <
1335 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
1336 xfs_bmbt_get_all(ep + 1, &RIGHT);
1337 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
1338 }
1339 STATE_SET(RIGHT_CONTIG,
1340 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
1341 new_endoff == RIGHT.br_startoff &&
1342 new->br_startblock + new->br_blockcount ==
1343 RIGHT.br_startblock &&
1344 newext == RIGHT.br_state &&
1345 new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
1346 ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
1347 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
1348 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
1349 <= MAXEXTLEN));
1350 /*
1351 * Switch out based on the FILLING and CONTIG state bits.
1352 */
1353 switch (SWITCH_STATE) {
1354
1355 case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1356 /*
1357 * Setting all of a previous oldext extent to newext.
1358 * The left and right neighbors are both contiguous with new.
1359 */
1360 xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
1361 XFS_DATA_FORK);
1362 xfs_bmbt_set_blockcount(ep - 1,
1363 LEFT.br_blockcount + PREV.br_blockcount +
1364 RIGHT.br_blockcount);
1365 xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
1366 XFS_DATA_FORK);
1367 xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
1368 XFS_DATA_FORK);
1369 xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
1370 ip->i_df.if_lastex = idx - 1;
1371 ip->i_d.di_nextents -= 2;
1372 if (cur == NULL)
1373 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1374 else {
1375 rval = XFS_ILOG_CORE;
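			/*
			 * Three btree records (LEFT, PREV and RIGHT)
			 * collapse into one: position the cursor at
			 * RIGHT and delete it, back up to the old PREV
			 * and delete that too, then back up once more
			 * and rewrite LEFT to cover all three ranges.
			 */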
1376 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
1377 RIGHT.br_startblock,
1378 RIGHT.br_blockcount, &i)))
1379 goto done;
1380 ASSERT(i == 1);
1381 if ((error = xfs_bmbt_delete(cur, &i)))
1382 goto done;
1383 ASSERT(i == 1);
1384 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1385 goto done;
1386 ASSERT(i == 1);
1387 if ((error = xfs_bmbt_delete(cur, &i)))
1388 goto done;
1389 ASSERT(i == 1);
1390 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1391 goto done;
1392 ASSERT(i == 1);
1393 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
1394 LEFT.br_startblock,
1395 LEFT.br_blockcount + PREV.br_blockcount +
1396 RIGHT.br_blockcount, LEFT.br_state)))
1397 goto done;
1398 }
1399 break;
1400
1401 case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
1402 /*
1403 * Setting all of a previous oldext extent to newext.
1404 * The left neighbor is contiguous, the right is not.
1405 */
1406 xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
1407 XFS_DATA_FORK);
1408 xfs_bmbt_set_blockcount(ep - 1,
1409 LEFT.br_blockcount + PREV.br_blockcount);
1410 xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
1411 XFS_DATA_FORK);
1412 ip->i_df.if_lastex = idx - 1;
1413 xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
1414 XFS_DATA_FORK);
1415 xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
1416 ip->i_d.di_nextents--;
1417 if (cur == NULL)
1418 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1419 else {
1420 rval = XFS_ILOG_CORE;
1421 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1422 PREV.br_startblock, PREV.br_blockcount,
1423 &i)))
1424 goto done;
1425 ASSERT(i == 1);
1426 if ((error = xfs_bmbt_delete(cur, &i)))
1427 goto done;
1428 ASSERT(i == 1);
1429 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1430 goto done;
1431 ASSERT(i == 1);
1432 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
1433 LEFT.br_startblock,
1434 LEFT.br_blockcount + PREV.br_blockcount,
1435 LEFT.br_state)))
1436 goto done;
1437 }
1438 break;
1439
1440 case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
1441 /*
1442 * Setting all of a previous oldext extent to newext.
1443 * The right neighbor is contiguous, the left is not.
1444 */
1445 xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
1446 XFS_DATA_FORK);
1447 xfs_bmbt_set_blockcount(ep,
1448 PREV.br_blockcount + RIGHT.br_blockcount);
1449 xfs_bmbt_set_state(ep, newext);
1450 xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
1451 XFS_DATA_FORK);
1452 ip->i_df.if_lastex = idx;
1453 xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
1454 XFS_DATA_FORK);
1455 xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
1456 ip->i_d.di_nextents--;
1457 if (cur == NULL)
1458 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1459 else {
1460 rval = XFS_ILOG_CORE;
1461 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
1462 RIGHT.br_startblock,
1463 RIGHT.br_blockcount, &i)))
1464 goto done;
1465 ASSERT(i == 1);
1466 if ((error = xfs_bmbt_delete(cur, &i)))
1467 goto done;
1468 ASSERT(i == 1);
1469 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1470 goto done;
1471 ASSERT(i == 1);
1472 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1473 new->br_startblock,
1474 new->br_blockcount + RIGHT.br_blockcount,
1475 newext)))
1476 goto done;
1477 }
1478 break;
1479
1480 case MASK2(LEFT_FILLING, RIGHT_FILLING):
1481 /*
1482 * Setting all of a previous oldext extent to newext.
1483 * Neither the left nor right neighbors are contiguous with
1484 * the new one.
1485 */
1486 xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
1487 XFS_DATA_FORK);
1488 xfs_bmbt_set_state(ep, newext);
1489 xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
1490 XFS_DATA_FORK);
1491 ip->i_df.if_lastex = idx;
1492 if (cur == NULL)
1493 rval = XFS_ILOG_DEXT;
1494 else {
1495 rval = 0;
1496 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1497 new->br_startblock, new->br_blockcount,
1498 &i)))
1499 goto done;
1500 ASSERT(i == 1);
1501 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1502 new->br_startblock, new->br_blockcount,
1503 newext)))
1504 goto done;
1505 }
1506 break;
1507
1508 case MASK2(LEFT_FILLING, LEFT_CONTIG):
1509 /*
1510 * Setting the first part of a previous oldext extent to newext.
1511 * The left neighbor is contiguous.
1512 */
1513 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
1514 XFS_DATA_FORK);
1515 xfs_bmbt_set_blockcount(ep - 1,
1516 LEFT.br_blockcount + new->br_blockcount);
1517 xfs_bmbt_set_startoff(ep,
1518 PREV.br_startoff + new->br_blockcount);
1519 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
1520 XFS_DATA_FORK);
1521 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
1522 XFS_DATA_FORK);
1523 xfs_bmbt_set_startblock(ep,
1524 new->br_startblock + new->br_blockcount);
1525 xfs_bmbt_set_blockcount(ep,
1526 PREV.br_blockcount - new->br_blockcount);
1527 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
1528 XFS_DATA_FORK);
1529 ip->i_df.if_lastex = idx - 1;
1530 if (cur == NULL)
1531 rval = XFS_ILOG_DEXT;
1532 else {
1533 rval = 0;
1534 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1535 PREV.br_startblock, PREV.br_blockcount,
1536 &i)))
1537 goto done;
1538 ASSERT(i == 1);
1539 if ((error = xfs_bmbt_update(cur,
1540 PREV.br_startoff + new->br_blockcount,
1541 PREV.br_startblock + new->br_blockcount,
1542 PREV.br_blockcount - new->br_blockcount,
1543 oldext)))
1544 goto done;
1545 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1546 goto done;
1547			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
1548				LEFT.br_startblock,
1549				LEFT.br_blockcount + new->br_blockcount,
1550				LEFT.br_state)))
1551				goto done;
1552 }
1553 break;
1554
1555 case MASK(LEFT_FILLING):
1556 /*
1557 * Setting the first part of a previous oldext extent to newext.
1558 * The left neighbor is not contiguous.
1559 */
1560 xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
1561 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
1562 xfs_bmbt_set_startoff(ep, new_endoff);
1563 xfs_bmbt_set_blockcount(ep,
1564 PREV.br_blockcount - new->br_blockcount);
1565 xfs_bmbt_set_startblock(ep,
1566 new->br_startblock + new->br_blockcount);
1567 xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK);
1568 xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
1569 XFS_DATA_FORK);
1570 xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
1571 ip->i_df.if_lastex = idx;
1572 ip->i_d.di_nextents++;
1573 if (cur == NULL)
1574 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1575 else {
1576 rval = XFS_ILOG_CORE;
1577 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1578 PREV.br_startblock, PREV.br_blockcount,
1579 &i)))
1580 goto done;
1581 ASSERT(i == 1);
1582 if ((error = xfs_bmbt_update(cur,
1583 PREV.br_startoff + new->br_blockcount,
1584 PREV.br_startblock + new->br_blockcount,
1585 PREV.br_blockcount - new->br_blockcount,
1586 oldext)))
1587 goto done;
1588 cur->bc_rec.b = *new;
1589 if ((error = xfs_bmbt_insert(cur, &i)))
1590 goto done;
1591 ASSERT(i == 1);
1592 }
1593 break;
1594
1595 case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
1596 /*
1597 * Setting the last part of a previous oldext extent to newext.
1598 * The right neighbor is contiguous with the new allocation.
1599 */
1600 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
1601 XFS_DATA_FORK);
1602 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
1603 XFS_DATA_FORK);
1604 xfs_bmbt_set_blockcount(ep,
1605 PREV.br_blockcount - new->br_blockcount);
1606 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
1607 XFS_DATA_FORK);
1608 xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
1609 new->br_blockcount + RIGHT.br_blockcount, newext);
1610 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
1611 XFS_DATA_FORK);
1612 ip->i_df.if_lastex = idx + 1;
1613 if (cur == NULL)
1614 rval = XFS_ILOG_DEXT;
1615 else {
1616 rval = 0;
1617 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1618 PREV.br_startblock,
1619 PREV.br_blockcount, &i)))
1620 goto done;
1621 ASSERT(i == 1);
1622 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
1623 PREV.br_startblock,
1624 PREV.br_blockcount - new->br_blockcount,
1625 oldext)))
1626 goto done;
1627 if ((error = xfs_bmbt_increment(cur, 0, &i)))
1628 goto done;
1629 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1630 new->br_startblock,
1631 new->br_blockcount + RIGHT.br_blockcount,
1632 newext)))
1633 goto done;
1634 }
1635 break;
1636
1637 case MASK(RIGHT_FILLING):
1638 /*
1639 * Setting the last part of a previous oldext extent to newext.
1640 * The right neighbor is not contiguous.
1641 */
1642 xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1643 xfs_bmbt_set_blockcount(ep,
1644 PREV.br_blockcount - new->br_blockcount);
1645 xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1646 xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
1647 new, NULL, XFS_DATA_FORK);
1648 xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
1649 ip->i_df.if_lastex = idx + 1;
1650 ip->i_d.di_nextents++;
1651 if (cur == NULL)
1652 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1653 else {
1654 rval = XFS_ILOG_CORE;
1655 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1656 PREV.br_startblock, PREV.br_blockcount,
1657 &i)))
1658 goto done;
1659 ASSERT(i == 1);
1660 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
1661 PREV.br_startblock,
1662 PREV.br_blockcount - new->br_blockcount,
1663 oldext)))
1664 goto done;
1665 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1666 new->br_startblock, new->br_blockcount,
1667 &i)))
1668 goto done;
1669 ASSERT(i == 0);
1670 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1671 if ((error = xfs_bmbt_insert(cur, &i)))
1672 goto done;
1673 ASSERT(i == 1);
1674 }
1675 break;
1676
1677 case 0:
1678 /*
1679 * Setting the middle part of a previous oldext extent to
1680 * newext. Contiguity is impossible here.
1681 * One extent becomes three extents.
1682 */
1683 xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
1684 xfs_bmbt_set_blockcount(ep,
1685 new->br_startoff - PREV.br_startoff);
1686 xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
1687 r[0] = *new;
1688 r[1].br_startoff = new_endoff;
1689 r[1].br_blockcount =
1690 PREV.br_startoff + PREV.br_blockcount - new_endoff;
1691 r[1].br_startblock = new->br_startblock + new->br_blockcount;
1692 r[1].br_state = oldext;
1693 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
1694 XFS_DATA_FORK);
1695 xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
1696 ip->i_df.if_lastex = idx + 1;
1697 ip->i_d.di_nextents += 2;
1698 if (cur == NULL)
1699 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1700 else {
1701 rval = XFS_ILOG_CORE;
1702 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1703 PREV.br_startblock, PREV.br_blockcount,
1704 &i)))
1705 goto done;
1706 ASSERT(i == 1);
1707 /* new right extent - oldext */
1708 if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
1709 r[1].br_startblock, r[1].br_blockcount,
1710 r[1].br_state)))
1711 goto done;
1712 /* new left extent - oldext */
1713 PREV.br_blockcount =
1714 new->br_startoff - PREV.br_startoff;
1715 cur->bc_rec.b = PREV;
1716 if ((error = xfs_bmbt_insert(cur, &i)))
1717 goto done;
1718 ASSERT(i == 1);
1719 if ((error = xfs_bmbt_increment(cur, 0, &i)))
1720 goto done;
1721 ASSERT(i == 1);
1722 /* new middle extent - newext */
1723 cur->bc_rec.b = *new;
1724 if ((error = xfs_bmbt_insert(cur, &i)))
1725 goto done;
1726 ASSERT(i == 1);
1727 }
1728 break;
1729
1730 case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1731 case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1732 case MASK2(LEFT_FILLING, RIGHT_CONTIG):
1733 case MASK2(RIGHT_FILLING, LEFT_CONTIG):
1734 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
1735 case MASK(LEFT_CONTIG):
1736 case MASK(RIGHT_CONTIG):
1737 /*
1738 * These cases are all impossible.
1739 */
1740 ASSERT(0);
1741 }
1742 *curp = cur;
1743done:
1744 *logflagsp = rval;
1745 return error;
1746#undef LEFT
1747#undef RIGHT
1748#undef PREV
1749#undef MASK
1750#undef MASK2
1751#undef MASK3
1752#undef MASK4
1753#undef STATE_SET
1754#undef STATE_TEST
1755#undef STATE_SET_TEST
1756#undef SWITCH_STATE
1757}
1758
1759/*
1760 * Called by xfs_bmap_add_extent to handle cases converting a hole
1761 * to a delayed allocation.
1762 */
1763/*ARGSUSED*/
1764STATIC int /* error */
1765xfs_bmap_add_extent_hole_delay(
1766 xfs_inode_t *ip, /* incore inode pointer */
1767 xfs_extnum_t idx, /* extent number to update/insert */
1768 xfs_btree_cur_t *cur, /* if null, not a btree */
1769 xfs_bmbt_irec_t *new, /* new data to put in extent list */
1770 int *logflagsp, /* inode logging flags */
1771 int rsvd) /* OK to allocate reserved blocks */
1772{
1773 xfs_bmbt_rec_t *base; /* base of extent entry list */
1774 xfs_bmbt_rec_t *ep; /* extent list entry for idx */
1775#ifdef XFS_BMAP_TRACE
1776 static char fname[] = "xfs_bmap_add_extent_hole_delay";
1777#endif
1778 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1779 xfs_filblks_t newlen=0; /* new indirect size */
1780 xfs_filblks_t oldlen=0; /* old indirect size */
1781 xfs_bmbt_irec_t right; /* right neighbor extent entry */
1782 int state; /* state bits, accessed thru macros */
1783 xfs_filblks_t temp; /* temp for indirect calculations */
1784 enum { /* bit number definitions for state */
1785 LEFT_CONTIG, RIGHT_CONTIG,
1786 LEFT_DELAY, RIGHT_DELAY,
1787 LEFT_VALID, RIGHT_VALID
1788 };
1789
1790#define MASK(b) (1 << (b))
1791#define MASK2(a,b) (MASK(a) | MASK(b))
1792#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
1793#define STATE_TEST(b) (state & MASK(b))
1794#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
1795 ((state &= ~MASK(b)), 0))
1796#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
1797
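	/*
	 * With delayed allocations there are no real block numbers to
	 * match up, so contiguity is decided on file offsets alone and
	 * only four outcomes (merge left, merge right, merge both,
	 * insert) are possible.  Each merge case recomputes the worst-
	 * case indirect-block reservation for the combined extent, and
	 * any excess is given back to the free-block count at the end.
	 */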
1798 base = ip->i_df.if_u1.if_extents;
1799 ep = &base[idx];
1800 state = 0;
1801 ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
1802 /*
1803 * Check and set flags if this segment has a left neighbor
1804 */
1805 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1806 xfs_bmbt_get_all(ep - 1, &left);
1807 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
1808 }
1809 /*
1810 * Check and set flags if the current (right) segment exists.
1811 * If it doesn't exist, we're converting the hole at end-of-file.
1812 */
1813 if (STATE_SET_TEST(RIGHT_VALID,
1814 idx <
1815 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
1816 xfs_bmbt_get_all(ep, &right);
1817 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
1818 }
1819 /*
1820 * Set contiguity flags on the left and right neighbors.
1821 * Don't let extents get too large, even if the pieces are contiguous.
1822 */
1823 STATE_SET(LEFT_CONTIG,
1824 STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
1825 left.br_startoff + left.br_blockcount == new->br_startoff &&
1826 left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
1827 STATE_SET(RIGHT_CONTIG,
1828 STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
1829 new->br_startoff + new->br_blockcount == right.br_startoff &&
1830 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
1831 (!STATE_TEST(LEFT_CONTIG) ||
1832 (left.br_blockcount + new->br_blockcount +
1833 right.br_blockcount <= MAXEXTLEN)));
1834 /*
1835 * Switch out based on the contiguity flags.
1836 */
1837 switch (SWITCH_STATE) {
1838
1839 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
1840 /*
1841 * New allocation is contiguous with delayed allocations
1842 * on the left and on the right.
1843 * Merge all three into a single extent list entry.
1844 */
1845 temp = left.br_blockcount + new->br_blockcount +
1846 right.br_blockcount;
1847 xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
1848 XFS_DATA_FORK);
1849 xfs_bmbt_set_blockcount(ep - 1, temp);
1850 oldlen = STARTBLOCKVAL(left.br_startblock) +
1851 STARTBLOCKVAL(new->br_startblock) +
1852 STARTBLOCKVAL(right.br_startblock);
1853 newlen = xfs_bmap_worst_indlen(ip, temp);
1854 xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
1855 xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
1856 XFS_DATA_FORK);
1857 xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1,
1858 XFS_DATA_FORK);
1859 xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
1860 ip->i_df.if_lastex = idx - 1;
1861 break;
1862
1863 case MASK(LEFT_CONTIG):
1864 /*
1865 * New allocation is contiguous with a delayed allocation
1866 * on the left.
1867 * Merge the new allocation with the left neighbor.
1868 */
1869 temp = left.br_blockcount + new->br_blockcount;
1870 xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1,
1871 XFS_DATA_FORK);
1872 xfs_bmbt_set_blockcount(ep - 1, temp);
1873 oldlen = STARTBLOCKVAL(left.br_startblock) +
1874 STARTBLOCKVAL(new->br_startblock);
1875 newlen = xfs_bmap_worst_indlen(ip, temp);
1876 xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
1877 xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
1878 XFS_DATA_FORK);
1879 ip->i_df.if_lastex = idx - 1;
1880 break;
1881
1882 case MASK(RIGHT_CONTIG):
1883 /*
1884 * New allocation is contiguous with a delayed allocation
1885 * on the right.
1886 * Merge the new allocation with the right neighbor.
1887 */
1888 xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK);
1889 temp = new->br_blockcount + right.br_blockcount;
1890 oldlen = STARTBLOCKVAL(new->br_startblock) +
1891 STARTBLOCKVAL(right.br_startblock);
1892 newlen = xfs_bmap_worst_indlen(ip, temp);
1893 xfs_bmbt_set_allf(ep, new->br_startoff,
1894 NULLSTARTBLOCK((int)newlen), temp, right.br_state);
1895 xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
1896 ip->i_df.if_lastex = idx;
1897 break;
1898
1899 case 0:
1900 /*
1901 * New allocation is not contiguous with another
1902 * delayed allocation.
1903 * Insert a new entry.
1904 */
1905 oldlen = newlen = 0;
1906 xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
1907 XFS_DATA_FORK);
1908 xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
1909 ip->i_df.if_lastex = idx;
1910 break;
1911 }
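	/*
	 * oldlen sums the indirect-block reservations that backed the
	 * merged pieces; newlen is the worst case for the combined
	 * extent.  A single extent never needs a larger reservation
	 * than its pieces did separately, hence the ASSERT below, and
	 * the difference is returned to the free-block count.
	 */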
1912 if (oldlen != newlen) {
1913 ASSERT(oldlen > newlen);
1914 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
1915 (int)(oldlen - newlen), rsvd);
1916 /*
1917 * Nothing to do for disk quota accounting here.
1918 */
1919 }
1920 *logflagsp = 0;
1921 return 0;
1922#undef MASK
1923#undef MASK2
1924#undef STATE_SET
1925#undef STATE_TEST
1926#undef STATE_SET_TEST
1927#undef SWITCH_STATE
1928}
1929
1930/*
1931 * Called by xfs_bmap_add_extent to handle cases converting a hole
1932 * to a real allocation.
1933 */
1934STATIC int /* error */
1935xfs_bmap_add_extent_hole_real(
1936 xfs_inode_t *ip, /* incore inode pointer */
1937 xfs_extnum_t idx, /* extent number to update/insert */
1938 xfs_btree_cur_t *cur, /* if null, not a btree */
1939 xfs_bmbt_irec_t *new, /* new data to put in extent list */
1940 int *logflagsp, /* inode logging flags */
1941 int whichfork) /* data or attr fork */
1942{
1943 xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */
1944 int error; /* error return value */
1945#ifdef XFS_BMAP_TRACE
1946 static char fname[] = "xfs_bmap_add_extent_hole_real";
1947#endif
1948 int i; /* temp state */
1949 xfs_ifork_t *ifp; /* inode fork pointer */
1950 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1951 xfs_bmbt_irec_t right; /* right neighbor extent entry */
1952 int state; /* state bits, accessed thru macros */
1953 enum { /* bit number definitions for state */
1954 LEFT_CONTIG, RIGHT_CONTIG,
1955 LEFT_DELAY, RIGHT_DELAY,
1956 LEFT_VALID, RIGHT_VALID
1957 };
1958
1959#define MASK(b) (1 << (b))
1960#define MASK2(a,b) (MASK(a) | MASK(b))
1961#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
1962#define STATE_TEST(b) (state & MASK(b))
1963#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
1964 ((state &= ~MASK(b)), 0))
1965#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
1966
1967 ifp = XFS_IFORK_PTR(ip, whichfork);
1968 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
1969 ep = &ifp->if_u1.if_extents[idx];
1970 state = 0;
1971 /*
1972 * Check and set flags if this segment has a left neighbor.
1973 */
1974 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1975 xfs_bmbt_get_all(ep - 1, &left);
1976 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
1977 }
1978 /*
1979 * Check and set flags if this segment has a current value.
1980 * Not true if we're inserting into the "hole" at eof.
1981 */
1982 if (STATE_SET_TEST(RIGHT_VALID,
1983 idx <
1984 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
1985 xfs_bmbt_get_all(ep, &right);
1986 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
1987 }
1988 /*
1989 * We're inserting a real allocation between "left" and "right".
1990 * Set the contiguity flags. Don't let extents get too large.
1991 */
1992 STATE_SET(LEFT_CONTIG,
1993 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
1994 left.br_startoff + left.br_blockcount == new->br_startoff &&
1995 left.br_startblock + left.br_blockcount == new->br_startblock &&
1996 left.br_state == new->br_state &&
1997 left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
1998 STATE_SET(RIGHT_CONTIG,
1999 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
2000 new->br_startoff + new->br_blockcount == right.br_startoff &&
2001 new->br_startblock + new->br_blockcount ==
2002 right.br_startblock &&
2003 new->br_state == right.br_state &&
2004 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
2005 (!STATE_TEST(LEFT_CONTIG) ||
2006 left.br_blockcount + new->br_blockcount +
2007 right.br_blockcount <= MAXEXTLEN));
2008
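	/*
	 * Unlike the delayed-allocation case above, a real extent is
	 * only contiguous with a neighbor if the block numbers line up
	 * and the extent states match, not just the file offsets.
	 */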
2009 /*
2010 * Select which case we're in here, and implement it.
2011 */
2012 switch (SWITCH_STATE) {
2013
2014 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
2015 /*
2016 * New allocation is contiguous with real allocations on the
2017 * left and on the right.
2018 * Merge all three into a single extent list entry.
2019 */
2020 xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
2021 whichfork);
2022 xfs_bmbt_set_blockcount(ep - 1,
2023 left.br_blockcount + new->br_blockcount +
2024 right.br_blockcount);
2025 xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
2026 whichfork);
2027 xfs_bmap_trace_delete(fname, "LC|RC", ip,
2028 idx, 1, whichfork);
2029 xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
2030 ifp->if_lastex = idx - 1;
2031 XFS_IFORK_NEXT_SET(ip, whichfork,
2032 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2033 if (cur == NULL) {
2034 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
2035 return 0;
2036 }
2037 *logflagsp = XFS_ILOG_CORE;
2038 if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
2039 right.br_startblock, right.br_blockcount, &i)))
2040 return error;
2041 ASSERT(i == 1);
2042 if ((error = xfs_bmbt_delete(cur, &i)))
2043 return error;
2044 ASSERT(i == 1);
2045 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
2046 return error;
2047 ASSERT(i == 1);
2048 error = xfs_bmbt_update(cur, left.br_startoff,
2049 left.br_startblock,
2050 left.br_blockcount + new->br_blockcount +
2051 right.br_blockcount, left.br_state);
2052 return error;
2053
2054 case MASK(LEFT_CONTIG):
2055 /*
2056 * New allocation is contiguous with a real allocation
2057 * on the left.
2058 * Merge the new allocation with the left neighbor.
2059 */
2060 xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork);
2061 xfs_bmbt_set_blockcount(ep - 1,
2062 left.br_blockcount + new->br_blockcount);
2063 xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
2064 ifp->if_lastex = idx - 1;
2065 if (cur == NULL) {
2066 *logflagsp = XFS_ILOG_FEXT(whichfork);
2067 return 0;
2068 }
2069 *logflagsp = 0;
2070 if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
2071 left.br_startblock, left.br_blockcount, &i)))
2072 return error;
2073 ASSERT(i == 1);
2074 error = xfs_bmbt_update(cur, left.br_startoff,
2075 left.br_startblock,
2076 left.br_blockcount + new->br_blockcount,
2077 left.br_state);
2078 return error;
2079
2080 case MASK(RIGHT_CONTIG):
2081 /*
2082 * New allocation is contiguous with a real allocation
2083 * on the right.
2084 * Merge the new allocation with the right neighbor.
2085 */
2086 xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork);
2087 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
2088 new->br_blockcount + right.br_blockcount,
2089 right.br_state);
2090 xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
2091 ifp->if_lastex = idx;
2092 if (cur == NULL) {
2093 *logflagsp = XFS_ILOG_FEXT(whichfork);
2094 return 0;
2095 }
2096 *logflagsp = 0;
2097 if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
2098 right.br_startblock, right.br_blockcount, &i)))
2099 return error;
2100 ASSERT(i == 1);
2101 error = xfs_bmbt_update(cur, new->br_startoff,
2102 new->br_startblock,
2103 new->br_blockcount + right.br_blockcount,
2104 right.br_state);
2105 return error;
2106
2107 case 0:
2108 /*
2109 * New allocation is not contiguous with another
2110 * real allocation.
2111 * Insert a new entry.
2112 */
2113 xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
2114 whichfork);
2115 xfs_bmap_insert_exlist(ip, idx, 1, new, whichfork);
2116 ifp->if_lastex = idx;
2117 XFS_IFORK_NEXT_SET(ip, whichfork,
2118 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
2119 if (cur == NULL) {
2120 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
2121 return 0;
2122 }
2123 *logflagsp = XFS_ILOG_CORE;
2124 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
2125 new->br_startblock, new->br_blockcount, &i)))
2126 return error;
2127 ASSERT(i == 0);
2128 cur->bc_rec.b.br_state = new->br_state;
2129 if ((error = xfs_bmbt_insert(cur, &i)))
2130 return error;
2131 ASSERT(i == 1);
2132 return 0;
2133 }
2134#undef MASK
2135#undef MASK2
2136#undef STATE_SET
2137#undef STATE_TEST
2138#undef STATE_SET_TEST
2139#undef SWITCH_STATE
2140 /* NOTREACHED */
2141 ASSERT(0);
2142	return 0; /* keep gcc quiet */
2143}
2144
2145#define XFS_ALLOC_GAP_UNITS 4
2146
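/*
 * XFS_ALLOC_GAP_UNITS tunes the placement heuristic in xfs_bmap_alloc:
 * a neighboring extent's edge is only projected across the intervening
 * gap when the gap is at most four times the length being allocated;
 * for a larger gap the hint stays at the neighbor's edge, and the
 * unprojected gap counts double when the two candidates are compared.
 */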
2147/*
2148 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
2149 * It figures out where to ask the underlying allocator to put the new extent.
2150 */
2151STATIC int /* error */
2152xfs_bmap_alloc(
2153 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2154{
2155 xfs_fsblock_t adjust; /* adjustment to block numbers */
2156 xfs_alloctype_t atype=0; /* type for allocation routines */
2157 int error; /* error return value */
2158 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
2159 xfs_mount_t *mp; /* mount point structure */
2160 int nullfb; /* true if ap->firstblock isn't set */
2161 int rt; /* true if inode is realtime */
2162#ifdef __KERNEL__
2163 xfs_extlen_t prod=0; /* product factor for allocators */
2164 xfs_extlen_t ralen=0; /* realtime allocation length */
2165#endif
2166
2167#define ISVALID(x,y) \
2168 (rt ? \
2169 (x) < mp->m_sb.sb_rblocks : \
2170 XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
2171 XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
2172 XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
2173
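	/*
	 * ISVALID(x, y) says whether block x is usable as an allocation
	 * hint relative to block y: realtime blocks only need to be
	 * inside the realtime area, while regular blocks must also lie
	 * in the same allocation group as y and within its bounds.
	 */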
2174 /*
2175 * Set up variables.
2176 */
2177 mp = ap->ip->i_mount;
2178 nullfb = ap->firstblock == NULLFSBLOCK;
2179 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
2180 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2181#ifdef __KERNEL__
2182 if (rt) {
2183 xfs_extlen_t extsz; /* file extent size for rt */
2184 xfs_fileoff_t nexto; /* next file offset */
2185 xfs_extlen_t orig_alen; /* original ap->alen */
2186 xfs_fileoff_t orig_end; /* original off+len */
2187 xfs_fileoff_t orig_off; /* original ap->off */
2188 xfs_extlen_t mod_off; /* modulus calculations */
2189 xfs_fileoff_t prevo; /* previous file offset */
2190 xfs_rtblock_t rtx; /* realtime extent number */
2191 xfs_extlen_t temp; /* temp for rt calculations */
2192
2193 /*
2194 * Set prod to match the realtime extent size.
2195 */
2196 if (!(extsz = ap->ip->i_d.di_extsize))
2197 extsz = mp->m_sb.sb_rextsize;
2198 prod = extsz / mp->m_sb.sb_rextsize;
2199 orig_off = ap->off;
2200 orig_alen = ap->alen;
2201 orig_end = orig_off + orig_alen;
2202 /*
2203 * If the file offset is unaligned vs. the extent size
2204 * we need to align it. This will be possible unless
2205 * the file was previously written with a kernel that didn't
2206 * perform this alignment.
2207 */
2208 mod_off = do_mod(orig_off, extsz);
2209 if (mod_off) {
2210 ap->alen += mod_off;
2211 ap->off -= mod_off;
2212 }
2213 /*
2214 * Same adjustment for the end of the requested area.
2215 */
2216 if ((temp = (ap->alen % extsz)))
2217 ap->alen += extsz - temp;
2218 /*
2219 * If the previous block overlaps with this proposed allocation
2220 * then move the start forward without adjusting the length.
2221 */
2222 prevo =
2223 ap->prevp->br_startoff == NULLFILEOFF ?
2224 0 :
2225 (ap->prevp->br_startoff +
2226 ap->prevp->br_blockcount);
2227 if (ap->off != orig_off && ap->off < prevo)
2228 ap->off = prevo;
2229 /*
2230 * If the next block overlaps with this proposed allocation
2231 * then move the start back without adjusting the length,
2232 * but not before offset 0.
2233 * This may of course make the start overlap previous block,
2234 * and if we hit the offset 0 limit then the next block
2235 * can still overlap too.
2236 */
2237 nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ?
2238 NULLFILEOFF : ap->gotp->br_startoff;
2239 if (!ap->eof &&
2240 ap->off + ap->alen != orig_end &&
2241 ap->off + ap->alen > nexto)
2242 ap->off = nexto > ap->alen ? nexto - ap->alen : 0;
2243 /*
2244 * If we're now overlapping the next or previous extent that
2245 * means we can't fit an extsz piece in this hole. Just move
2246 * the start forward to the first valid spot and set
2247 * the length so we hit the end.
2248 */
2249 if ((ap->off != orig_off && ap->off < prevo) ||
2250 (ap->off + ap->alen != orig_end &&
2251 ap->off + ap->alen > nexto)) {
2252 ap->off = prevo;
2253 ap->alen = nexto - prevo;
2254 }
2255 /*
2256 * If the result isn't a multiple of rtextents we need to
2257 * remove blocks until it is.
2258 */
2259 if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) {
2260 /*
2261 * We're not covering the original request, or
2262 * we won't be able to once we fix the length.
2263 */
2264 if (orig_off < ap->off ||
2265 orig_end > ap->off + ap->alen ||
2266 ap->alen - temp < orig_alen)
2267 return XFS_ERROR(EINVAL);
2268 /*
2269 * Try to fix it by moving the start up.
2270 */
2271 if (ap->off + temp <= orig_off) {
2272 ap->alen -= temp;
2273 ap->off += temp;
2274 }
2275 /*
2276 * Try to fix it by moving the end in.
2277 */
2278 else if (ap->off + ap->alen - temp >= orig_end)
2279 ap->alen -= temp;
2280 /*
2281 * Set the start to the minimum then trim the length.
2282 */
2283 else {
2284 ap->alen -= orig_off - ap->off;
2285 ap->off = orig_off;
2286 ap->alen -= ap->alen % mp->m_sb.sb_rextsize;
2287 }
2288 /*
2289 * Result doesn't cover the request, fail it.
2290 */
2291 if (orig_off < ap->off || orig_end > ap->off + ap->alen)
2292 return XFS_ERROR(EINVAL);
2293 }
2294 ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
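		/*
		 * A worked example of the alignment above, assuming
		 * extsz = 4 and no neighbor overlap: a request at
		 * offset 10 for 3 blocks has mod_off = 2, so it grows
		 * to offset 8, length 5, and the tail is then rounded
		 * up to length 8, so blocks [8,16) fully cover the
		 * original [10,13).
		 */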
2295 /*
2296 * If the offset & length are not perfectly aligned
2297 * then kill prod, it will just get us in trouble.
2298 */
2299 if (do_mod(ap->off, extsz) || ap->alen % extsz)
2300 prod = 1;
2301 /*
2302 * Set ralen to be the actual requested length in rtextents.
2303 */
2304 ralen = ap->alen / mp->m_sb.sb_rextsize;
2305		/*
2306		 * If the old value was close enough to MAXEXTLEN that
2307		 * we rounded up to it, cut it back so it's valid again.
2308		 * A really large request (bigger than MAXEXTLEN) has
2309		 * already been clamped before we get here, so we can't
2310		 * adjust the starting point to match the original length.
2311		 */
2312 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2313 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2314 /*
2315 * If it's an allocation to an empty file at offset 0,
2316 * pick an extent that will space things out in the rt area.
2317 */
2318 if (ap->eof && ap->off == 0) {
2319 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
2320 if (error)
2321 return error;
2322 ap->rval = rtx * mp->m_sb.sb_rextsize;
2323 } else
2324 ap->rval = 0;
2325 }
2326#else
2327 if (rt)
2328 ap->rval = 0;
2329#endif /* __KERNEL__ */
2330 else if (nullfb)
2331 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2332 else
2333 ap->rval = ap->firstblock;
2334 /*
2335 * If allocating at eof, and there's a previous real block,
2336	 * try to use its last block as our starting point.
2337 */
2338 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2339 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
2340 ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
2341 ap->prevp->br_startblock)) {
2342 ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
2343 /*
2344 * Adjust for the gap between prevp and us.
2345 */
2346 adjust = ap->off -
2347 (ap->prevp->br_startoff + ap->prevp->br_blockcount);
2348 if (adjust &&
2349 ISVALID(ap->rval + adjust, ap->prevp->br_startblock))
2350 ap->rval += adjust;
2351 }
2352 /*
2353 * If not at eof, then compare the two neighbor blocks.
2354 * Figure out whether either one gives us a good starting point,
2355 * and pick the better one.
2356 */
2357 else if (!ap->eof) {
2358 xfs_fsblock_t gotbno; /* right side block number */
2359 xfs_fsblock_t gotdiff=0; /* right side difference */
2360 xfs_fsblock_t prevbno; /* left side block number */
2361 xfs_fsblock_t prevdiff=0; /* left side difference */
2362
2363 /*
2364 * If there's a previous (left) block, select a requested
2365 * start block based on it.
2366 */
2367 if (ap->prevp->br_startoff != NULLFILEOFF &&
2368 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
2369 (prevbno = ap->prevp->br_startblock +
2370 ap->prevp->br_blockcount) &&
2371 ISVALID(prevbno, ap->prevp->br_startblock)) {
2372 /*
2373 * Calculate gap to end of previous block.
2374 */
2375 adjust = prevdiff = ap->off -
2376 (ap->prevp->br_startoff +
2377 ap->prevp->br_blockcount);
2378 /*
2379 * Figure the startblock based on the previous block's
2380 * end and the gap size.
2381 * Heuristic!
2382 * If the gap is large relative to the piece we're
2383 * allocating, or using it gives us an invalid block
2384 * number, then just use the end of the previous block.
2385 */
2386 if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
2387 ISVALID(prevbno + prevdiff,
2388 ap->prevp->br_startblock))
2389 prevbno += adjust;
2390 else
2391 prevdiff += adjust;
2392 /*
2393 * If the firstblock forbids it, can't use it,
2394 * must use default.
2395 */
2396 if (!rt && !nullfb &&
2397 XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
2398 prevbno = NULLFSBLOCK;
2399 }
2400 /*
2401 * No previous block or can't follow it, just default.
2402 */
2403 else
2404 prevbno = NULLFSBLOCK;
2405 /*
2406 * If there's a following (right) block, select a requested
2407 * start block based on it.
2408 */
2409 if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
2410 /*
2411 * Calculate gap to start of next block.
2412 */
2413 adjust = gotdiff = ap->gotp->br_startoff - ap->off;
2414 /*
2415 * Figure the startblock based on the next block's
2416 * start and the gap size.
2417 */
2418 gotbno = ap->gotp->br_startblock;
2419 /*
2420 * Heuristic!
2421 * If the gap is large relative to the piece we're
2422 * allocating, or using it gives us an invalid block
2423 * number, then just use the start of the next block
2424 * offset by our length.
2425 */
2426 if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
2427 ISVALID(gotbno - gotdiff, gotbno))
2428 gotbno -= adjust;
2429 else if (ISVALID(gotbno - ap->alen, gotbno)) {
2430 gotbno -= ap->alen;
2431 gotdiff += adjust - ap->alen;
2432 } else
2433 gotdiff += adjust;
2434 /*
2435 * If the firstblock forbids it, can't use it,
2436 * must use default.
2437 */
2438 if (!rt && !nullfb &&
2439 XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
2440 gotbno = NULLFSBLOCK;
2441 }
2442 /*
2443 * No next block, just default.
2444 */
2445 else
2446 gotbno = NULLFSBLOCK;
2447 /*
2448 * If both valid, pick the better one, else the only good
2449 * one, else ap->rval is already set (to 0 or the inode block).
2450 */
2451 if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
2452 ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
2453 else if (prevbno != NULLFSBLOCK)
2454 ap->rval = prevbno;
2455 else if (gotbno != NULLFSBLOCK)
2456 ap->rval = gotbno;
2457 }
2458 /*
2459 * If allowed, use ap->rval; otherwise must use firstblock since
2460 * it's in the right allocation group.
2461 */
2462 if (nullfb || rt || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno)
2463 ;
2464 else
2465 ap->rval = ap->firstblock;
2466 /*
2467 * Realtime allocation, done through xfs_rtallocate_extent.
2468 */
2469 if (rt) {
2470#ifndef __KERNEL__
2471 ASSERT(0);
2472#else
2473 xfs_rtblock_t rtb;
2474
2475 atype = ap->rval == 0 ?
2476 XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
2477 do_div(ap->rval, mp->m_sb.sb_rextsize);
2478 rtb = ap->rval;
2479 ap->alen = ralen;
2480 if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
2481 &ralen, atype, ap->wasdel, prod, &rtb)))
2482 return error;
2483 if (rtb == NULLFSBLOCK && prod > 1 &&
2484 (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
2485 ap->alen, &ralen, atype,
2486 ap->wasdel, 1, &rtb)))
2487 return error;
2488 ap->rval = rtb;
2489 if (ap->rval != NULLFSBLOCK) {
2490 ap->rval *= mp->m_sb.sb_rextsize;
2491 ralen *= mp->m_sb.sb_rextsize;
2492 ap->alen = ralen;
2493 ap->ip->i_d.di_nblocks += ralen;
2494 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2495 if (ap->wasdel)
2496 ap->ip->i_delayed_blks -= ralen;
2497 /*
2498 * Adjust the disk quota also. This was reserved
2499 * earlier.
2500 */
2501 XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
2502 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
2503 XFS_TRANS_DQ_RTBCOUNT,
2504 (long) ralen);
2505 } else
2506 ap->alen = 0;
2507#endif /* __KERNEL__ */
2508 }
2509 /*
2510 * Normal allocation, done through xfs_alloc_vextent.
2511 */
2512 else {
2513 xfs_agnumber_t ag;
2514 xfs_alloc_arg_t args;
2515 xfs_extlen_t blen;
2516 xfs_extlen_t delta;
2517 int isaligned;
2518 xfs_extlen_t longest;
2519 xfs_extlen_t need;
2520 xfs_extlen_t nextminlen=0;
2521 int notinit;
2522 xfs_perag_t *pag;
2523 xfs_agnumber_t startag;
2524 int tryagain;
2525
2526 tryagain = isaligned = 0;
2527 args.tp = ap->tp;
2528 args.mp = mp;
2529 args.fsbno = ap->rval;
2530 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
2531 blen = 0;
2532 if (nullfb) {
2533 args.type = XFS_ALLOCTYPE_START_BNO;
2534 args.total = ap->total;
2535 /*
2536 * Find the longest available space.
2537 * We're going to try for the whole allocation at once.
2538 */
2539 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2540 notinit = 0;
2541 down_read(&mp->m_peraglock);
2542 while (blen < ap->alen) {
2543 pag = &mp->m_perag[ag];
2544 if (!pag->pagf_init &&
2545 (error = xfs_alloc_pagf_init(mp, args.tp,
2546 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2547 up_read(&mp->m_peraglock);
2548 return error;
2549 }
2550 /*
2551 * See xfs_alloc_fix_freelist...
2552 */
2553 if (pag->pagf_init) {
2554 need = XFS_MIN_FREELIST_PAG(pag, mp);
2555 delta = need > pag->pagf_flcount ?
2556 need - pag->pagf_flcount : 0;
2557 longest = (pag->pagf_longest > delta) ?
2558 (pag->pagf_longest - delta) :
2559 (pag->pagf_flcount > 0 ||
2560 pag->pagf_longest > 0);
2561 if (blen < longest)
2562 blen = longest;
2563 } else
2564 notinit = 1;
2565 if (++ag == mp->m_sb.sb_agcount)
2566 ag = 0;
2567 if (ag == startag)
2568 break;
2569 }
2570 up_read(&mp->m_peraglock);
2571			/*
2572			 * The AGF reads above used trylock and may have skipped
2573			 * AGs, so there may still be space for this request.
2574			 */
2575 if (notinit || blen < ap->minlen)
2576 args.minlen = ap->minlen;
2577 /*
2578 * If the best seen length is less than the request
2579 * length, use the best as the minimum.
2580 */
2581 else if (blen < ap->alen)
2582 args.minlen = blen;
2583 /*
2584 * Otherwise we've seen an extent as big as alen,
2585 * use that as the minimum.
2586 */
2587 else
2588 args.minlen = ap->alen;
2589 } else if (ap->low) {
2590 args.type = XFS_ALLOCTYPE_FIRST_AG;
2591 args.total = args.minlen = ap->minlen;
2592 } else {
2593 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2594 args.total = ap->total;
2595 args.minlen = ap->minlen;
2596 }
2597 if (ap->ip->i_d.di_extsize) {
2598 args.prod = ap->ip->i_d.di_extsize;
2599 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
2600 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2601 } else if (mp->m_sb.sb_blocksize >= NBPP) {
2602 args.prod = 1;
2603 args.mod = 0;
2604 } else {
2605 args.prod = NBPP >> mp->m_sb.sb_blocklog;
2606 if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
2607 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2608 }
2609		/*
2610		 * If we are not low on available data blocks, the
2611		 * underlying logical volume manager is a stripe, and
2612		 * the file offset is zero, then try to allocate data
2613		 * blocks on a stripe unit boundary.
2614		 * NOTE: ap->aeof is only set if the allocation length
2615		 * is >= the stripe unit and the allocation offset is
2616		 * at the end of file.
2617		 */
2618 if (!ap->low && ap->aeof) {
2619 if (!ap->off) {
2620 args.alignment = mp->m_dalign;
2621 atype = args.type;
2622 isaligned = 1;
2623 /*
2624 * Adjust for alignment
2625 */
2626 if (blen > args.alignment && blen <= ap->alen)
2627 args.minlen = blen - args.alignment;
2628 args.minalignslop = 0;
2629 } else {
2630 /*
2631 * First try an exact bno allocation.
2632 * If it fails then do a near or start bno
2633 * allocation with alignment turned on.
2634 */
2635 atype = args.type;
2636 tryagain = 1;
2637 args.type = XFS_ALLOCTYPE_THIS_BNO;
2638 args.alignment = 1;
2639 /*
2640 * Compute the minlen+alignment for the
2641 * next case. Set slop so that the value
2642 * of minlen+alignment+slop doesn't go up
2643 * between the calls.
2644 */
2645 if (blen > mp->m_dalign && blen <= ap->alen)
2646 nextminlen = blen - mp->m_dalign;
2647 else
2648 nextminlen = args.minlen;
2649 if (nextminlen + mp->m_dalign > args.minlen + 1)
2650 args.minalignslop =
2651 nextminlen + mp->m_dalign -
2652 args.minlen - 1;
2653 else
2654 args.minalignslop = 0;
2655 }
2656 } else {
2657 args.alignment = 1;
2658 args.minalignslop = 0;
2659 }
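		/*
		 * For instance, with blen == 0, args.minlen = 100 and a
		 * stripe unit of 16, the exact-bno attempt above runs
		 * with alignment 1 and minalignslop 15, so that
		 * minlen + alignment + slop = 116 matches the aligned
		 * retry's worst case of 100 + 16 + 0.
		 */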
2660 args.minleft = ap->minleft;
2661 args.wasdel = ap->wasdel;
2662 args.isfl = 0;
2663 args.userdata = ap->userdata;
2664 if ((error = xfs_alloc_vextent(&args)))
2665 return error;
2666 if (tryagain && args.fsbno == NULLFSBLOCK) {
2667 /*
2668 * Exact allocation failed. Now try with alignment
2669 * turned on.
2670 */
2671 args.type = atype;
2672 args.fsbno = ap->rval;
2673 args.alignment = mp->m_dalign;
2674 args.minlen = nextminlen;
2675 args.minalignslop = 0;
2676 isaligned = 1;
2677 if ((error = xfs_alloc_vextent(&args)))
2678 return error;
2679 }
2680 if (isaligned && args.fsbno == NULLFSBLOCK) {
2681			/*
2682			 * Allocation failed, so turn off alignment and
2683			 * try again.
2684			 */
2685 args.type = atype;
2686 args.fsbno = ap->rval;
2687 args.alignment = 0;
2688 if ((error = xfs_alloc_vextent(&args)))
2689 return error;
2690 }
2691 if (args.fsbno == NULLFSBLOCK && nullfb &&
2692 args.minlen > ap->minlen) {
2693 args.minlen = ap->minlen;
2694 args.type = XFS_ALLOCTYPE_START_BNO;
2695 args.fsbno = ap->rval;
2696 if ((error = xfs_alloc_vextent(&args)))
2697 return error;
2698 }
2699 if (args.fsbno == NULLFSBLOCK && nullfb) {
2700 args.fsbno = 0;
2701 args.type = XFS_ALLOCTYPE_FIRST_AG;
2702 args.total = ap->minlen;
2703 args.minleft = 0;
2704 if ((error = xfs_alloc_vextent(&args)))
2705 return error;
2706 ap->low = 1;
2707 }
2708 if (args.fsbno != NULLFSBLOCK) {
2709 ap->firstblock = ap->rval = args.fsbno;
2710 ASSERT(nullfb || fb_agno == args.agno ||
2711 (ap->low && fb_agno < args.agno));
2712 ap->alen = args.len;
2713 ap->ip->i_d.di_nblocks += args.len;
2714 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2715 if (ap->wasdel)
2716 ap->ip->i_delayed_blks -= args.len;
2717 /*
2718 * Adjust the disk quota also. This was reserved
2719 * earlier.
2720 */
2721 XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
2722 ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
2723 XFS_TRANS_DQ_BCOUNT,
2724 (long) args.len);
2725 } else {
2726 ap->rval = NULLFSBLOCK;
2727 ap->alen = 0;
2728 }
2729 }
2730 return 0;
2731#undef ISVALID
2732}
2733
2734/*
2735 * Transform a btree format file with only one leaf node, where the
2736 * extents list will fit in the inode, into an extents format file.
2737 * Since the extent list is already in-core, all we have to do is
2738 * give up the space for the btree root and pitch the leaf block.
2739 */
2740STATIC int /* error */
2741xfs_bmap_btree_to_extents(
2742 xfs_trans_t *tp, /* transaction pointer */
2743 xfs_inode_t *ip, /* incore inode pointer */
2744 xfs_btree_cur_t *cur, /* btree cursor */
2745 int *logflagsp, /* inode logging flags */
2746 int whichfork) /* data or attr fork */
2747{
2748 /* REFERENCED */
2749 xfs_bmbt_block_t *cblock;/* child btree block */
2750 xfs_fsblock_t cbno; /* child block number */
2751 xfs_buf_t *cbp; /* child block's buffer */
2752 int error; /* error return value */
2753 xfs_ifork_t *ifp; /* inode fork data */
2754 xfs_mount_t *mp; /* mount point structure */
2755 xfs_bmbt_ptr_t *pp; /* ptr to block address */
2756 xfs_bmbt_block_t *rblock;/* root btree block */
2757
2758 ifp = XFS_IFORK_PTR(ip, whichfork);
2759 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
2760 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
2761 rblock = ifp->if_broot;
2762 ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) == 1);
2763 ASSERT(INT_GET(rblock->bb_numrecs, ARCH_CONVERT) == 1);
2764 ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
2765 mp = ip->i_mount;
2766 pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
2767 *logflagsp = 0;
2768#ifdef DEBUG
2769 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1)))
2770 return error;
2771#endif
2772 cbno = INT_GET(*pp, ARCH_CONVERT);
2773 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
2774 XFS_BMAP_BTREE_REF)))
2775 return error;
2776 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
2777 if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
2778 return error;
2779 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
2780 ip->i_d.di_nblocks--;
2781 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
2782 xfs_trans_binval(tp, cbp);
2783 if (cur->bc_bufs[0] == cbp)
2784 cur->bc_bufs[0] = NULL;
2785 xfs_iroot_realloc(ip, -1, whichfork);
2786 ASSERT(ifp->if_broot == NULL);
2787 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
2788 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
2789 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
2790 return 0;
2791}
2792
2793/*
2794 * Called by xfs_bmapi to update extent list structure and the btree
2795 * after removing space (or undoing a delayed allocation).
2796 */
2797STATIC int /* error */
2798xfs_bmap_del_extent(
2799 xfs_inode_t *ip, /* incore inode pointer */
2800 xfs_trans_t *tp, /* current transaction pointer */
2801 xfs_extnum_t idx, /* extent number to update/delete */
2802 xfs_bmap_free_t *flist, /* list of extents to be freed */
2803 xfs_btree_cur_t *cur, /* if null, not a btree */
2804 xfs_bmbt_irec_t *del, /* data to remove from extent list */
2805 int *logflagsp, /* inode logging flags */
2806 int whichfork, /* data or attr fork */
2807 int rsvd) /* OK to allocate reserved blocks */
2808{
2809 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
2810 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
2811 xfs_fsblock_t del_endblock=0; /* first block past del */
2812 xfs_fileoff_t del_endoff; /* first offset past del */
2813 int delay; /* current block is delayed allocated */
2814 int do_fx; /* free extent at end of routine */
2815 xfs_bmbt_rec_t *ep; /* current extent entry pointer */
2816 int error; /* error return value */
2817 int flags; /* inode logging flags */
2818#ifdef XFS_BMAP_TRACE
2819 static char fname[] = "xfs_bmap_del_extent";
2820#endif
2821 xfs_bmbt_irec_t got; /* current extent entry */
2822 xfs_fileoff_t got_endoff; /* first offset past got */
2823 int i; /* temp state */
2824 xfs_ifork_t *ifp; /* inode fork pointer */
2825 xfs_mount_t *mp; /* mount structure */
2826 xfs_filblks_t nblks; /* quota/sb block count */
2827 xfs_bmbt_irec_t new; /* new record to be inserted */
2828 /* REFERENCED */
2829 xfs_extnum_t nextents; /* number of extents in list */
2830 uint qfield; /* quota field to update */
2831 xfs_filblks_t temp; /* for indirect length calculations */
2832 xfs_filblks_t temp2; /* for indirect length calculations */
2833
2834 XFS_STATS_INC(xs_del_exlist);
2835 mp = ip->i_mount;
2836 ifp = XFS_IFORK_PTR(ip, whichfork);
2837 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2838 ASSERT(idx >= 0 && idx < nextents);
2839 ASSERT(del->br_blockcount > 0);
2840 ep = &ifp->if_u1.if_extents[idx];
2841 xfs_bmbt_get_all(ep, &got);
2842 ASSERT(got.br_startoff <= del->br_startoff);
2843 del_endoff = del->br_startoff + del->br_blockcount;
2844 got_endoff = got.br_startoff + got.br_blockcount;
2845 ASSERT(got_endoff >= del_endoff);
2846 delay = ISNULLSTARTBLOCK(got.br_startblock);
2847 ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
2848 flags = 0;
2849 qfield = 0;
2850 error = 0;
2851 /*
2852 * If deleting a real allocation, must free up the disk space.
2853 */
2854 if (!delay) {
2855 flags = XFS_ILOG_CORE;
2856 /*
2857 * Realtime allocation. Free it and record di_nblocks update.
2858 */
2859 if (whichfork == XFS_DATA_FORK &&
2860 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
2861 xfs_fsblock_t bno;
2862 xfs_filblks_t len;
2863
2864 ASSERT(do_mod(del->br_blockcount,
2865 mp->m_sb.sb_rextsize) == 0);
2866 ASSERT(do_mod(del->br_startblock,
2867 mp->m_sb.sb_rextsize) == 0);
2868 bno = del->br_startblock;
2869 len = del->br_blockcount;
2870 do_div(bno, mp->m_sb.sb_rextsize);
2871 do_div(len, mp->m_sb.sb_rextsize);
2872 if ((error = xfs_rtfree_extent(ip->i_transp, bno,
2873 (xfs_extlen_t)len)))
2874 goto done;
2875 do_fx = 0;
2876 nblks = len * mp->m_sb.sb_rextsize;
2877 qfield = XFS_TRANS_DQ_RTBCOUNT;
2878 }
2879 /*
2880 * Ordinary allocation.
2881 */
2882 else {
2883 do_fx = 1;
2884 nblks = del->br_blockcount;
2885 qfield = XFS_TRANS_DQ_BCOUNT;
2886 }
2887 /*
2888 * Set up del_endblock and cur for later.
2889 */
2890 del_endblock = del->br_startblock + del->br_blockcount;
2891 if (cur) {
2892 if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
2893 got.br_startblock, got.br_blockcount,
2894 &i)))
2895 goto done;
2896 ASSERT(i == 1);
2897 }
2898 da_old = da_new = 0;
2899 } else {
2900 da_old = STARTBLOCKVAL(got.br_startblock);
2901 da_new = 0;
2902 nblks = 0;
2903 do_fx = 0;
2904 }
2905	/*
2906	 * Set flag value to use in switch statement: bit 1 (value 2) is set
2907	 * if del starts at got's start, bit 0 (value 1) if it ends at got's end.
2908	 */
2909 switch (((got.br_startoff == del->br_startoff) << 1) |
2910 (got_endoff == del_endoff)) {
2911 case 3:
2912 /*
2913 * Matches the whole extent. Delete the entry.
2914 */
2915 xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork);
2916 xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
2917 ifp->if_lastex = idx;
2918 if (delay)
2919 break;
2920 XFS_IFORK_NEXT_SET(ip, whichfork,
2921 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2922 flags |= XFS_ILOG_CORE;
2923 if (!cur) {
2924 flags |= XFS_ILOG_FEXT(whichfork);
2925 break;
2926 }
2927 if ((error = xfs_bmbt_delete(cur, &i)))
2928 goto done;
2929 ASSERT(i == 1);
2930 break;
2931
2932 case 2:
2933 /*
2934 * Deleting the first part of the extent.
2935 */
2936 xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork);
2937 xfs_bmbt_set_startoff(ep, del_endoff);
2938 temp = got.br_blockcount - del->br_blockcount;
2939 xfs_bmbt_set_blockcount(ep, temp);
2940 ifp->if_lastex = idx;
2941 if (delay) {
2942 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2943 da_old);
2944 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
2945 xfs_bmap_trace_post_update(fname, "2", ip, idx,
2946 whichfork);
2947 da_new = temp;
2948 break;
2949 }
2950 xfs_bmbt_set_startblock(ep, del_endblock);
2951 xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork);
2952 if (!cur) {
2953 flags |= XFS_ILOG_FEXT(whichfork);
2954 break;
2955 }
2956 if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
2957 got.br_blockcount - del->br_blockcount,
2958 got.br_state)))
2959 goto done;
2960 break;
2961
2962 case 1:
2963 /*
2964 * Deleting the last part of the extent.
2965 */
2966 temp = got.br_blockcount - del->br_blockcount;
2967 xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork);
2968 xfs_bmbt_set_blockcount(ep, temp);
2969 ifp->if_lastex = idx;
2970 if (delay) {
2971 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2972 da_old);
2973 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
2974 xfs_bmap_trace_post_update(fname, "1", ip, idx,
2975 whichfork);
2976 da_new = temp;
2977 break;
2978 }
2979 xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork);
2980 if (!cur) {
2981 flags |= XFS_ILOG_FEXT(whichfork);
2982 break;
2983 }
2984 if ((error = xfs_bmbt_update(cur, got.br_startoff,
2985 got.br_startblock,
2986 got.br_blockcount - del->br_blockcount,
2987 got.br_state)))
2988 goto done;
2989 break;
2990
2991 case 0:
2992 /*
2993 * Deleting the middle of the extent.
2994 */
2995 temp = del->br_startoff - got.br_startoff;
2996 xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork);
2997 xfs_bmbt_set_blockcount(ep, temp);
2998 new.br_startoff = del_endoff;
2999 temp2 = got_endoff - del_endoff;
3000 new.br_blockcount = temp2;
3001 new.br_state = got.br_state;
3002 if (!delay) {
3003 new.br_startblock = del_endblock;
3004 flags |= XFS_ILOG_CORE;
3005 if (cur) {
3006 if ((error = xfs_bmbt_update(cur,
3007 got.br_startoff,
3008 got.br_startblock, temp,
3009 got.br_state)))
3010 goto done;
3011 if ((error = xfs_bmbt_increment(cur, 0, &i)))
3012 goto done;
3013 cur->bc_rec.b = new;
3014 error = xfs_bmbt_insert(cur, &i);
3015 if (error && error != ENOSPC)
3016 goto done;
3017				/*
3018				 * If we get no-space back from the btree
3019				 * insert, it tried a split and we have a
3020				 * zero block reservation.
3021				 * Fix up our state and return the error.
3022				 */
3023 if (error == ENOSPC) {
3024 /*
3025 * Reset the cursor, don't trust
3026 * it after any insert operation.
3027 */
3028 if ((error = xfs_bmbt_lookup_eq(cur,
3029 got.br_startoff,
3030 got.br_startblock,
3031 temp, &i)))
3032 goto done;
3033 ASSERT(i == 1);
3034 /*
3035 * Update the btree record back
3036 * to the original value.
3037 */
3038 if ((error = xfs_bmbt_update(cur,
3039 got.br_startoff,
3040 got.br_startblock,
3041 got.br_blockcount,
3042 got.br_state)))
3043 goto done;
3044 /*
3045 * Reset the extent record back
3046 * to the original value.
3047 */
3048 xfs_bmbt_set_blockcount(ep,
3049 got.br_blockcount);
3050 flags = 0;
3051 error = XFS_ERROR(ENOSPC);
3052 goto done;
3053 }
3054 ASSERT(i == 1);
3055 } else
3056 flags |= XFS_ILOG_FEXT(whichfork);
3057 XFS_IFORK_NEXT_SET(ip, whichfork,
3058 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
3059 } else {
3060 ASSERT(whichfork == XFS_DATA_FORK);
3061 temp = xfs_bmap_worst_indlen(ip, temp);
3062 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
3063 temp2 = xfs_bmap_worst_indlen(ip, temp2);
3064 new.br_startblock = NULLSTARTBLOCK((int)temp2);
3065 da_new = temp + temp2;
3066 while (da_new > da_old) {
3067 if (temp) {
3068 temp--;
3069 da_new--;
3070 xfs_bmbt_set_startblock(ep,
3071 NULLSTARTBLOCK((int)temp));
3072 }
3073 if (da_new == da_old)
3074 break;
3075 if (temp2) {
3076 temp2--;
3077 da_new--;
3078 new.br_startblock =
3079 NULLSTARTBLOCK((int)temp2);
3080 }
3081 }
3082 }
3083 xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork);
3084 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL,
3085 whichfork);
3086 xfs_bmap_insert_exlist(ip, idx + 1, 1, &new, whichfork);
3087 ifp->if_lastex = idx + 1;
3088 break;
3089 }
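	/*
	 * Splitting one delayed extent into two (case 0 above) can make
	 * the sum of the two worst-case indirect reservations exceed
	 * the old single one; the loop there steals blocks back from
	 * the halves one at a time until da_new fits within da_old.
	 */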
3090 /*
3091 * If we need to, add to list of extents to delete.
3092 */
3093 if (do_fx)
3094 xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
3095 mp);
3096 /*
3097 * Adjust inode # blocks in the file.
3098 */
3099 if (nblks)
3100 ip->i_d.di_nblocks -= nblks;
3101 /*
3102 * Adjust quota data.
3103 */
3104 if (qfield)
3105 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks);
3106
3107 /*
3108 * Account for change in delayed indirect blocks.
3109 * Nothing to do for disk quota accounting here.
3110 */
3111 ASSERT(da_old >= da_new);
3112 if (da_old > da_new)
3113 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
3114 rsvd);
3115done:
3116 *logflagsp = flags;
3117 return error;
3118}
3119
3120/*
3121 * Remove the entry "free" from the free item list. Prev points to the
3122 * previous entry, unless "free" is the head of the list.
3123 */
3124STATIC void
3125xfs_bmap_del_free(
3126 xfs_bmap_free_t *flist, /* free item list header */
3127 xfs_bmap_free_item_t *prev, /* previous item on list, if any */
3128 xfs_bmap_free_item_t *free) /* list item to be freed */
3129{
3130 if (prev)
3131 prev->xbfi_next = free->xbfi_next;
3132 else
3133 flist->xbf_first = free->xbfi_next;
3134 flist->xbf_count--;
3135 kmem_zone_free(xfs_bmap_free_item_zone, free);
3136}
3137
3138/*
3139 * Remove count entries from the extents array for inode "ip", starting
3140 * at index "idx". Copies the remaining items down over the deleted ones,
3141 * and gives back the excess memory.
3142 */
3143STATIC void
3144xfs_bmap_delete_exlist(
3145 xfs_inode_t *ip, /* incore inode pointer */
3146 xfs_extnum_t idx, /* starting delete index */
3147 xfs_extnum_t count, /* count of items to delete */
3148 int whichfork) /* data or attr fork */
3149{
3150 xfs_bmbt_rec_t *base; /* base of extent list */
3151 xfs_ifork_t *ifp; /* inode fork pointer */
3152 xfs_extnum_t nextents; /* number of extents in list after */
3153
3154 ifp = XFS_IFORK_PTR(ip, whichfork);
3155 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3156 base = ifp->if_u1.if_extents;
3157 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - count;
3158 memmove(&base[idx], &base[idx + count],
3159 (nextents - idx) * sizeof(*base));
3160 xfs_iext_realloc(ip, -count, whichfork);
3161}
3162
3163/*
3164 * Convert an extents-format file into a btree-format file.
3165 * The new file will have a root block (in the inode) and a single child block.
3166 */
3167STATIC int /* error */
3168xfs_bmap_extents_to_btree(
3169 xfs_trans_t *tp, /* transaction pointer */
3170 xfs_inode_t *ip, /* incore inode pointer */
3171 xfs_fsblock_t *firstblock, /* first-block-allocated */
3172 xfs_bmap_free_t *flist, /* blocks freed in xaction */
3173 xfs_btree_cur_t **curp, /* cursor returned to caller */
3174 int wasdel, /* converting a delayed alloc */
3175 int *logflagsp, /* inode logging flags */
3176 int whichfork) /* data or attr fork */
3177{
3178 xfs_bmbt_block_t *ablock; /* allocated (child) bt block */
3179 xfs_buf_t *abp; /* buffer for ablock */
3180 xfs_alloc_arg_t args; /* allocation arguments */
3181 xfs_bmbt_rec_t *arp; /* child record pointer */
3182 xfs_bmbt_block_t *block; /* btree root block */
3183 xfs_btree_cur_t *cur; /* bmap btree cursor */
3184 xfs_bmbt_rec_t *ep; /* extent list pointer */
3185 int error; /* error return value */
3186 xfs_extnum_t i, cnt; /* extent list index */
3187 xfs_ifork_t *ifp; /* inode fork pointer */
3188 xfs_bmbt_key_t *kp; /* root block key pointer */
3189 xfs_mount_t *mp; /* mount structure */
3190 xfs_extnum_t nextents; /* extent list size */
3191 xfs_bmbt_ptr_t *pp; /* root block address pointer */
3192
3193 ifp = XFS_IFORK_PTR(ip, whichfork);
3194 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
3195 ASSERT(ifp->if_ext_max ==
3196 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
3197 /*
3198 * Make space in the inode incore.
3199 */
3200 xfs_iroot_realloc(ip, 1, whichfork);
3201 ifp->if_flags |= XFS_IFBROOT;
3202 /*
3203 * Fill in the root.
3204 */
3205 block = ifp->if_broot;
3206 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
3207 INT_SET(block->bb_level, ARCH_CONVERT, 1);
3208 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
3209 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
3210 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
3211 /*
3212 * Need a cursor. Can't allocate until bb_level is filled in.
3213 */
3214 mp = ip->i_mount;
3215 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
3216 whichfork);
3217 cur->bc_private.b.firstblock = *firstblock;
3218 cur->bc_private.b.flist = flist;
3219 cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
3220 /*
3221 * Convert to a btree with two levels, one record in root.
3222 */
3223 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
3224 args.tp = tp;
3225 args.mp = mp;
3226 if (*firstblock == NULLFSBLOCK) {
3227 args.type = XFS_ALLOCTYPE_START_BNO;
3228 args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
3229 } else if (flist->xbf_low) {
3230 args.type = XFS_ALLOCTYPE_START_BNO;
3231 args.fsbno = *firstblock;
3232 } else {
3233 args.type = XFS_ALLOCTYPE_NEAR_BNO;
3234 args.fsbno = *firstblock;
3235 }
3236 args.minlen = args.maxlen = args.prod = 1;
3237 args.total = args.minleft = args.alignment = args.mod = args.isfl =
3238 args.minalignslop = 0;
3239 args.wasdel = wasdel;
3240 *logflagsp = 0;
3241 if ((error = xfs_alloc_vextent(&args))) {
3242 xfs_iroot_realloc(ip, -1, whichfork);
3243 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
3244 return error;
3245 }
3246 /*
3247	 * Allocation can't fail; the space was reserved.
3248 */
3249 ASSERT(args.fsbno != NULLFSBLOCK);
3250 ASSERT(*firstblock == NULLFSBLOCK ||
3251 args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
3252 (flist->xbf_low &&
3253 args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
3254 *firstblock = cur->bc_private.b.firstblock = args.fsbno;
3255 cur->bc_private.b.allocated++;
3256 ip->i_d.di_nblocks++;
3257 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
3258 abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
3259 /*
3260 * Fill in the child block.
3261 */
3262 ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
3263 INT_SET(ablock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
3264 ablock->bb_level = 0;
3265 INT_SET(ablock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
3266 INT_SET(ablock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
3267 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
3268 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
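	/*
	 * Copy only the real extents into the child block; delayed
	 * allocations (NULLSTARTBLOCK) have no on-disk form yet and
	 * are skipped.
	 */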
3269 for (ep = ifp->if_u1.if_extents, cnt = i = 0; i < nextents; i++, ep++) {
3270 if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
3271 arp->l0 = INT_GET(ep->l0, ARCH_CONVERT);
3272 arp->l1 = INT_GET(ep->l1, ARCH_CONVERT);
3273 arp++; cnt++;
3274 }
3275 }
3276 INT_SET(ablock->bb_numrecs, ARCH_CONVERT, cnt);
3277 ASSERT(INT_GET(ablock->bb_numrecs, ARCH_CONVERT) == XFS_IFORK_NEXTENTS(ip, whichfork));
3278 /*
3279 * Fill in the root key and pointer.
3280 */
3281 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
3282 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
3283 INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp));
3284 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
3285 INT_SET(*pp, ARCH_CONVERT, args.fsbno);
3286 /*
3287 * Do all this logging at the end so that
3288 * the root is at the right level.
3289 */
3290 xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
3291 xfs_bmbt_log_recs(cur, abp, 1, INT_GET(ablock->bb_numrecs, ARCH_CONVERT));
3292 ASSERT(*curp == NULL);
3293 *curp = cur;
3294 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
3295 return 0;
3296}
3297
3298/*
3299 * Insert new item(s) in the extent list for inode "ip".
3300 * "count" new items are inserted at offset "idx".
3301 */
3302STATIC void
3303xfs_bmap_insert_exlist(
3304 xfs_inode_t *ip, /* incore inode pointer */
3305 xfs_extnum_t idx, /* starting index of new items */
3306 xfs_extnum_t count, /* number of inserted items */
3307 xfs_bmbt_irec_t *new, /* items to insert */
3308 int whichfork) /* data or attr fork */
3309{
3310 xfs_bmbt_rec_t *base; /* extent list base */
3311 xfs_ifork_t *ifp; /* inode fork pointer */
3312 xfs_extnum_t nextents; /* extent list size */
3313 xfs_extnum_t to; /* extent list index */
3314
3315 ifp = XFS_IFORK_PTR(ip, whichfork);
3316 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3317 xfs_iext_realloc(ip, count, whichfork);
3318 base = ifp->if_u1.if_extents;
3319 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3320 memmove(&base[idx + count], &base[idx],
3321 (nextents - (idx + count)) * sizeof(*base));
3322 for (to = idx; to < idx + count; to++, new++)
3323 xfs_bmbt_set_all(&base[to], new);
3324}
3325
3326/*
3327 * Convert a local file to an extents file.
3328 * This code cannot be used for data forks of regular files,
3329 * since the file data needs to get logged so things will stay consistent.
3330 * (The bmap-level manipulations are OK, though.)
3331 */
3332STATIC int /* error */
3333xfs_bmap_local_to_extents(
3334 xfs_trans_t *tp, /* transaction pointer */
3335 xfs_inode_t *ip, /* incore inode pointer */
3336 xfs_fsblock_t *firstblock, /* first block allocated in xaction */
3337 xfs_extlen_t total, /* total blocks needed by transaction */
3338 int *logflagsp, /* inode logging flags */
3339 int whichfork) /* data or attr fork */
3340{
3341 int error; /* error return value */
3342 int flags; /* logging flags returned */
3343#ifdef XFS_BMAP_TRACE
3344 static char fname[] = "xfs_bmap_local_to_extents";
3345#endif
3346 xfs_ifork_t *ifp; /* inode fork pointer */
3347
3348 /*
3349 * We don't want to deal with the case of keeping inode data inline yet.
3350	 * So passing in the data fork of a regular inode is invalid.
3351 */
3352 ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG &&
3353 whichfork == XFS_DATA_FORK));
3354 ifp = XFS_IFORK_PTR(ip, whichfork);
3355 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
3356 flags = 0;
3357 error = 0;
3358 if (ifp->if_bytes) {
3359 xfs_alloc_arg_t args; /* allocation arguments */
3360 xfs_buf_t *bp; /* buffer for extent list block */
3361 xfs_bmbt_rec_t *ep; /* extent list pointer */
3362
3363 args.tp = tp;
3364 args.mp = ip->i_mount;
3365 ASSERT(ifp->if_flags & XFS_IFINLINE);
3366 /*
3367 * Allocate a block. We know we need only one, since the
3368 * file currently fits in an inode.
3369 */
3370 if (*firstblock == NULLFSBLOCK) {
3371 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
3372 args.type = XFS_ALLOCTYPE_START_BNO;
3373 } else {
3374 args.fsbno = *firstblock;
3375 args.type = XFS_ALLOCTYPE_NEAR_BNO;
3376 }
3377 args.total = total;
3378 args.mod = args.minleft = args.alignment = args.wasdel =
3379 args.isfl = args.minalignslop = 0;
3380 args.minlen = args.maxlen = args.prod = 1;
3381 if ((error = xfs_alloc_vextent(&args)))
3382 goto done;
3383 /*
3384		 * Can't fail; the space was reserved.
3385 */
3386 ASSERT(args.fsbno != NULLFSBLOCK);
3387 ASSERT(args.len == 1);
3388 *firstblock = args.fsbno;
3389 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
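		/*
		 * Copy the inline data into the new block, log it, and
		 * switch the fork from inline data to a one-entry
		 * extent list pointing at that block.
		 */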
3390 memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data,
3391 ifp->if_bytes);
3392 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3393 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
3394 xfs_iext_realloc(ip, 1, whichfork);
3395 ep = ifp->if_u1.if_extents;
3396 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
3397 xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork);
3398 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
3399 ip->i_d.di_nblocks = 1;
3400 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
3401 XFS_TRANS_DQ_BCOUNT, 1L);
3402 flags |= XFS_ILOG_FEXT(whichfork);
3403 } else
3404 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
3405 ifp->if_flags &= ~XFS_IFINLINE;
3406 ifp->if_flags |= XFS_IFEXTENTS;
3407 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
3408 flags |= XFS_ILOG_CORE;
3409done:
3410 *logflagsp = flags;
3411 return error;
3412}
3413
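/*
 * Search the extent list "base" for the extent containing "bno",
 * trying the hinted index "lastx" (and its successor) before falling
 * back to a binary search.  Output semantics are described at
 * xfs_bmap_search_extents() below, which calls this routine.
 */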
3414xfs_bmbt_rec_t * /* pointer to found extent entry */
3415xfs_bmap_do_search_extents(
3416 xfs_bmbt_rec_t *base, /* base of extent list */
3417 xfs_extnum_t lastx, /* last extent index used */
3418 xfs_extnum_t nextents, /* extent list size */
3419 xfs_fileoff_t bno, /* block number searched for */
3420 int *eofp, /* out: end of file found */
3421 xfs_extnum_t *lastxp, /* out: last extent index */
3422 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
3423 xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
3424{
3425 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
3426 xfs_bmbt_irec_t got; /* extent list entry, decoded */
3427 int high; /* high index of binary search */
3428 int low; /* low index of binary search */
3429
3430 /*
3431 * Initialize the extent entry structure to catch access to
3432 * uninitialized br_startblock field.
3433 */
3434 got.br_startoff = 0xffa5a5a5a5a5a5a5LL;
3435 got.br_blockcount = 0xa55a5a5a5a5a5a5aLL;
3436 got.br_state = XFS_EXT_INVALID;
3437
3438#if XFS_BIG_BLKNOS
3439 got.br_startblock = 0xffffa5a5a5a5a5a5LL;
3440#else
3441 got.br_startblock = 0xffffa5a5;
3442#endif
3443
3444 if (lastx != NULLEXTNUM && lastx < nextents)
3445 ep = base + lastx;
3446 else
3447 ep = NULL;
3448 prevp->br_startoff = NULLFILEOFF;
3449 if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) &&
3450 bno < got.br_startoff +
3451 (got.br_blockcount = xfs_bmbt_get_blockcount(ep)))
3452 *eofp = 0;
3453 else if (ep && lastx < nextents - 1 &&
3454 bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) &&
3455 bno < got.br_startoff +
3456 (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) {
3457 lastx++;
3458 ep++;
3459 *eofp = 0;
3460 } else if (nextents == 0)
3461 *eofp = 1;
3462 else if (bno == 0 &&
3463 (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) {
3464 ep = base;
3465 lastx = 0;
3466 got.br_blockcount = xfs_bmbt_get_blockcount(ep);
3467 *eofp = 0;
3468 } else {
3469 /* binary search the extents array */
3470 low = 0;
3471 high = nextents - 1;
3472 while (low <= high) {
3473 XFS_STATS_INC(xs_cmp_exlist);
3474 lastx = (low + high) >> 1;
3475 ep = base + lastx;
3476 got.br_startoff = xfs_bmbt_get_startoff(ep);
3477 got.br_blockcount = xfs_bmbt_get_blockcount(ep);
3478 if (bno < got.br_startoff)
3479 high = lastx - 1;
3480 else if (bno >= got.br_startoff + got.br_blockcount)
3481 low = lastx + 1;
3482 else {
3483 got.br_startblock = xfs_bmbt_get_startblock(ep);
3484 got.br_state = xfs_bmbt_get_state(ep);
3485 *eofp = 0;
3486 *lastxp = lastx;
3487 *gotp = got;
3488 return ep;
3489 }
3490 }
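		/*
		 * No extent contains bno: the search ended in a hole.
		 * Decide whether the hole is before or after the extent
		 * found at "lastx", and record the previous extent.
		 */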
3491 if (bno >= got.br_startoff + got.br_blockcount) {
3492 lastx++;
3493 if (lastx == nextents) {
3494 *eofp = 1;
3495 got.br_startblock = xfs_bmbt_get_startblock(ep);
3496 got.br_state = xfs_bmbt_get_state(ep);
3497 *prevp = got;
3498 ep = NULL;
3499 } else {
3500 *eofp = 0;
3501 xfs_bmbt_get_all(ep, prevp);
3502 ep++;
3503 got.br_startoff = xfs_bmbt_get_startoff(ep);
3504 got.br_blockcount = xfs_bmbt_get_blockcount(ep);
3505 }
3506 } else {
3507 *eofp = 0;
3508 if (ep > base)
3509 xfs_bmbt_get_all(ep - 1, prevp);
3510 }
3511 }
3512 if (ep) {
3513 got.br_startblock = xfs_bmbt_get_startblock(ep);
3514 got.br_state = xfs_bmbt_get_state(ep);
3515 }
3516 *lastxp = lastx;
3517 *gotp = got;
3518 return ep;
3519}
3520
3521/*
3522 * Search the extents list for the inode, for the extent containing bno.
3523 * If bno lies in a hole, point to the next entry. If bno lies past eof,
3524 * *eofp will be set, and *prevp will contain the last entry (null if none).
3525 * Else, *lastxp will be set to the index of the found
3526 * entry; *gotp will contain the entry.
3527 */
3528STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */
3529xfs_bmap_search_extents(
3530 xfs_inode_t *ip, /* incore inode pointer */
3531 xfs_fileoff_t bno, /* block number searched for */
3532 int whichfork, /* data or attr fork */
3533 int *eofp, /* out: end of file found */
3534 xfs_extnum_t *lastxp, /* out: last extent index */
3535 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
3536 xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
3537{
3538 xfs_ifork_t *ifp; /* inode fork pointer */
3539 xfs_bmbt_rec_t *base; /* base of extent list */
3540 xfs_extnum_t lastx; /* last extent index used */
3541 xfs_extnum_t nextents; /* extent list size */
3542 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
3543 int rt; /* realtime flag */
3544
3545 XFS_STATS_INC(xs_look_exlist);
3546 ifp = XFS_IFORK_PTR(ip, whichfork);
3547 lastx = ifp->if_lastex;
3548 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3549 base = &ifp->if_u1.if_extents[0];
3550
3551 ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp,
3552 lastxp, gotp, prevp);
3553 rt = ip->i_d.di_flags & XFS_DIFLAG_REALTIME;
3554	if (!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM)) {
3555		cmn_err(CE_PANIC, "Access to block zero: fs: <%s> inode: %lld "
3556			"start_block : %llx start_off : %llx blkcnt : %llx "
3557			"extent-state : %x \n",
3558			(ip->i_mount)->m_fsname, (long long)ip->i_ino,
3559			gotp->br_startblock, gotp->br_startoff,
3560			gotp->br_blockcount, gotp->br_state);
3561 }
3562 return ep;
3563}
3564
3565
3566#ifdef XFS_BMAP_TRACE
3567ktrace_t *xfs_bmap_trace_buf;
3568
3569/*
3570 * Add a bmap trace buffer entry. Base routine for the others.
3571 */
3572STATIC void
3573xfs_bmap_trace_addentry(
3574 int opcode, /* operation */
3575 char *fname, /* function name */
3576 char *desc, /* operation description */
3577 xfs_inode_t *ip, /* incore inode pointer */
3578 xfs_extnum_t idx, /* index of entry(ies) */
3579 xfs_extnum_t cnt, /* count of entries, 1 or 2 */
3580 xfs_bmbt_rec_t *r1, /* first record */
3581 xfs_bmbt_rec_t *r2, /* second record or null */
3582 int whichfork) /* data or attr fork */
3583{
3584 xfs_bmbt_rec_t tr2;
3585
3586 ASSERT(cnt == 1 || cnt == 2);
3587 ASSERT(r1 != NULL);
3588 if (cnt == 1) {
3589 ASSERT(r2 == NULL);
3590 r2 = &tr2;
3591 memset(&tr2, 0, sizeof(tr2));
3592 } else
3593 ASSERT(r2 != NULL);
3594 ktrace_enter(xfs_bmap_trace_buf,
3595 (void *)(__psint_t)(opcode | (whichfork << 16)),
3596 (void *)fname, (void *)desc, (void *)ip,
3597 (void *)(__psint_t)idx,
3598 (void *)(__psint_t)cnt,
3599 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3600 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3601 (void *)(__psunsigned_t)(r1->l0 >> 32),
3602 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3603 (void *)(__psunsigned_t)(r1->l1 >> 32),
3604 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3605 (void *)(__psunsigned_t)(r2->l0 >> 32),
3606 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3607 (void *)(__psunsigned_t)(r2->l1 >> 32),
3608 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3609 );
3610 ASSERT(ip->i_xtrace);
3611 ktrace_enter(ip->i_xtrace,
3612 (void *)(__psint_t)(opcode | (whichfork << 16)),
3613 (void *)fname, (void *)desc, (void *)ip,
3614 (void *)(__psint_t)idx,
3615 (void *)(__psint_t)cnt,
3616 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3617 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3618 (void *)(__psunsigned_t)(r1->l0 >> 32),
3619 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3620 (void *)(__psunsigned_t)(r1->l1 >> 32),
3621 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3622 (void *)(__psunsigned_t)(r2->l0 >> 32),
3623 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3624 (void *)(__psunsigned_t)(r2->l1 >> 32),
3625 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3626 );
3627}
3628
3629/*
3630 * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
3631 */
3632STATIC void
3633xfs_bmap_trace_delete(
3634 char *fname, /* function name */
3635 char *desc, /* operation description */
3636 xfs_inode_t *ip, /* incore inode pointer */
3637 xfs_extnum_t idx, /* index of entry(entries) deleted */
3638 xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
3639 int whichfork) /* data or attr fork */
3640{
3641 xfs_ifork_t *ifp; /* inode fork pointer */
3642
3643 ifp = XFS_IFORK_PTR(ip, whichfork);
3644 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
3645 cnt, &ifp->if_u1.if_extents[idx],
3646 cnt == 2 ? &ifp->if_u1.if_extents[idx + 1] : NULL,
3647 whichfork);
3648}
3649
3650/*
3651 * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
3652 * reading in the extents list from the disk (in the btree).
3653 */
3654STATIC void
3655xfs_bmap_trace_insert(
3656 char *fname, /* function name */
3657 char *desc, /* operation description */
3658 xfs_inode_t *ip, /* incore inode pointer */
3659 xfs_extnum_t idx, /* index of entry(entries) inserted */
3660 xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
3661 xfs_bmbt_irec_t *r1, /* inserted record 1 */
3662 xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
3663 int whichfork) /* data or attr fork */
3664{
3665 xfs_bmbt_rec_t tr1; /* compressed record 1 */
3666 xfs_bmbt_rec_t tr2; /* compressed record 2 if needed */
3667
3668 xfs_bmbt_set_all(&tr1, r1);
3669 if (cnt == 2) {
3670 ASSERT(r2 != NULL);
3671 xfs_bmbt_set_all(&tr2, r2);
3672 } else {
3673 ASSERT(cnt == 1);
3674 ASSERT(r2 == NULL);
3675 }
3676 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
3677 cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
3678}
3679
3680/*
3681 * Add bmap trace entry after updating an extent list entry in place.
3682 */
3683STATIC void
3684xfs_bmap_trace_post_update(
3685 char *fname, /* function name */
3686 char *desc, /* operation description */
3687 xfs_inode_t *ip, /* incore inode pointer */
3688 xfs_extnum_t idx, /* index of entry updated */
3689 int whichfork) /* data or attr fork */
3690{
3691 xfs_ifork_t *ifp; /* inode fork pointer */
3692
3693 ifp = XFS_IFORK_PTR(ip, whichfork);
3694 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
3695 1, &ifp->if_u1.if_extents[idx], NULL, whichfork);
3696}
3697
3698/*
3699 * Add bmap trace entry prior to updating an extent list entry in place.
3700 */
3701STATIC void
3702xfs_bmap_trace_pre_update(
3703 char *fname, /* function name */
3704 char *desc, /* operation description */
3705 xfs_inode_t *ip, /* incore inode pointer */
3706 xfs_extnum_t idx, /* index of entry to be updated */
3707 int whichfork) /* data or attr fork */
3708{
3709 xfs_ifork_t *ifp; /* inode fork pointer */
3710
3711 ifp = XFS_IFORK_PTR(ip, whichfork);
3712 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
3713 &ifp->if_u1.if_extents[idx], NULL, whichfork);
3714}
3715#endif /* XFS_BMAP_TRACE */
3716
3717/*
3718 * Compute the worst-case number of indirect blocks that will be used
3719 * for ip's delayed extent of length "len".
3720 */
3721STATIC xfs_filblks_t
3722xfs_bmap_worst_indlen(
3723 xfs_inode_t *ip, /* incore inode pointer */
3724 xfs_filblks_t len) /* delayed extent length */
3725{
3726 int level; /* btree level number */
3727 int maxrecs; /* maximum record count at this level */
3728 xfs_mount_t *mp; /* mount structure */
3729 xfs_filblks_t rval; /* return value */
3730
3731 mp = ip->i_mount;
3732 maxrecs = mp->m_bmap_dmxr[0];
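	/*
	 * Worst case, the delayed extent fragments into "len" one-block
	 * extents, so start with one leaf record per block and divide
	 * by the fanout at each level of the tree.
	 */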
3733 for (level = 0, rval = 0;
3734 level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
3735 level++) {
3736 len += maxrecs - 1;
3737 do_div(len, maxrecs);
3738 rval += len;
3739 if (len == 1)
3740 return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
3741 level - 1;
3742 if (level == 0)
3743 maxrecs = mp->m_bmap_dmxr[1];
3744 }
3745 return rval;
3746}
3747
3748#if defined(XFS_RW_TRACE)
3749STATIC void
3750xfs_bunmap_trace(
3751 xfs_inode_t *ip,
3752 xfs_fileoff_t bno,
3753 xfs_filblks_t len,
3754 int flags,
3755 inst_t *ra)
3756{
3757 if (ip->i_rwtrace == NULL)
3758 return;
3759 ktrace_enter(ip->i_rwtrace,
3760 (void *)(__psint_t)XFS_BUNMAPI,
3761 (void *)ip,
3762 (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
3763 (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
3764 (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
3765 (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
3766 (void *)(__psint_t)len,
3767 (void *)(__psint_t)flags,
3768 (void *)(unsigned long)current_cpu(),
3769 (void *)ra,
3770 (void *)0,
3771 (void *)0,
3772 (void *)0,
3773 (void *)0,
3774 (void *)0,
3775 (void *)0);
3776}
3777#endif
3778
3779/*
3780 * Convert inode from non-attributed to attributed.
3781 * Must not be in a transaction, and ip must not be locked.
3782 */
3783int /* error code */
3784xfs_bmap_add_attrfork(
3785 xfs_inode_t *ip, /* incore inode pointer */
3786	int			rsvd)		/* OK to allocate reserved blocks in trans */
3787{
3788 int blks; /* space reservation */
3789 int committed; /* xaction was committed */
3790 int error; /* error return value */
3791 xfs_fsblock_t firstblock; /* 1st block/ag allocated */
3792 xfs_bmap_free_t flist; /* freed extent list */
3793 int logflags; /* logging flags */
3794 xfs_mount_t *mp; /* mount structure */
3795 unsigned long s; /* spinlock spl value */
3796 xfs_trans_t *tp; /* transaction pointer */
3797
3798 ASSERT(ip->i_df.if_ext_max ==
3799 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3800 if (XFS_IFORK_Q(ip))
3801 return 0;
3802 mp = ip->i_mount;
3803 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
3804 tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
3805 blks = XFS_ADDAFORK_SPACE_RES(mp);
3806 if (rsvd)
3807 tp->t_flags |= XFS_TRANS_RESERVE;
3808 if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
3809 XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
3810 goto error0;
3811 xfs_ilock(ip, XFS_ILOCK_EXCL);
3812 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ?
3813 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
3814 XFS_QMOPT_RES_REGBLKS);
3815 if (error) {
3816 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3817 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
3818 return error;
3819 }
3820 if (XFS_IFORK_Q(ip))
3821 goto error1;
3822 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
3823 /*
3824 * For inodes coming from pre-6.2 filesystems.
3825 */
3826 ASSERT(ip->i_d.di_aformat == 0);
3827 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
3828 }
3829 ASSERT(ip->i_d.di_anextents == 0);
3830 VN_HOLD(XFS_ITOV(ip));
3831 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3832 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
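	/*
	 * Choose where the attribute fork starts in the inode literal
	 * area; di_forkoff is stored in units of 8 bytes.
	 */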
3833 switch (ip->i_d.di_format) {
3834 case XFS_DINODE_FMT_DEV:
3835 ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
3836 break;
3837 case XFS_DINODE_FMT_UUID:
3838 ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
3839 break;
3840 case XFS_DINODE_FMT_LOCAL:
3841 case XFS_DINODE_FMT_EXTENTS:
3842 case XFS_DINODE_FMT_BTREE:
3843 ip->i_d.di_forkoff = mp->m_attroffset >> 3;
3844 break;
3845 default:
3846 ASSERT(0);
3847 error = XFS_ERROR(EINVAL);
3848 goto error1;
3849 }
3850 ip->i_df.if_ext_max =
3851 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3852 ASSERT(ip->i_afp == NULL);
3853 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
3854 ip->i_afp->if_ext_max =
3855 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3856 ip->i_afp->if_flags = XFS_IFEXTENTS;
3857 logflags = 0;
3858 XFS_BMAP_INIT(&flist, &firstblock);
3859 switch (ip->i_d.di_format) {
3860 case XFS_DINODE_FMT_LOCAL:
3861 error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
3862 &logflags);
3863 break;
3864 case XFS_DINODE_FMT_EXTENTS:
3865 error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
3866 &flist, &logflags);
3867 break;
3868 case XFS_DINODE_FMT_BTREE:
3869 error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
3870 &logflags);
3871 break;
3872 default:
3873 error = 0;
3874 break;
3875 }
3876 if (logflags)
3877 xfs_trans_log_inode(tp, ip, logflags);
3878 if (error)
3879 goto error2;
3880 if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
3881 s = XFS_SB_LOCK(mp);
3882 if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
3883 XFS_SB_VERSION_ADDATTR(&mp->m_sb);
3884 XFS_SB_UNLOCK(mp, s);
3885 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
3886 } else
3887 XFS_SB_UNLOCK(mp, s);
3888 }
3889 if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed)))
3890 goto error2;
3891 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL);
3892 ASSERT(ip->i_df.if_ext_max ==
3893 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3894 return error;
3895error2:
3896 xfs_bmap_cancel(&flist);
3897error1:
3898 ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE));
3899 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3900error0:
3901 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
3902 ASSERT(ip->i_df.if_ext_max ==
3903 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3904 return error;
3905}
3906
3907/*
3908 * Add the extent to the list of extents to be freed at transaction end.
3909 * The list is maintained sorted (by block number).
3910 */
3911/* ARGSUSED */
3912void
3913xfs_bmap_add_free(
3914 xfs_fsblock_t bno, /* fs block number of extent */
3915 xfs_filblks_t len, /* length of extent */
3916 xfs_bmap_free_t *flist, /* list of extents */
3917 xfs_mount_t *mp) /* mount point structure */
3918{
3919 xfs_bmap_free_item_t *cur; /* current (next) element */
3920 xfs_bmap_free_item_t *new; /* new element */
3921 xfs_bmap_free_item_t *prev; /* previous element */
3922#ifdef DEBUG
3923 xfs_agnumber_t agno;
3924 xfs_agblock_t agbno;
3925
3926 ASSERT(bno != NULLFSBLOCK);
3927 ASSERT(len > 0);
3928 ASSERT(len <= MAXEXTLEN);
3929 ASSERT(!ISNULLSTARTBLOCK(bno));
3930 agno = XFS_FSB_TO_AGNO(mp, bno);
3931 agbno = XFS_FSB_TO_AGBNO(mp, bno);
3932 ASSERT(agno < mp->m_sb.sb_agcount);
3933 ASSERT(agbno < mp->m_sb.sb_agblocks);
3934 ASSERT(len < mp->m_sb.sb_agblocks);
3935 ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
3936#endif
3937 ASSERT(xfs_bmap_free_item_zone != NULL);
3938 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
3939 new->xbfi_startblock = bno;
3940 new->xbfi_blockcount = (xfs_extlen_t)len;
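	/*
	 * Walk the list to the first entry at or beyond "bno" and link
	 * the new item in ahead of it, keeping the list sorted.
	 */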
3941 for (prev = NULL, cur = flist->xbf_first;
3942 cur != NULL;
3943 prev = cur, cur = cur->xbfi_next) {
3944 if (cur->xbfi_startblock >= bno)
3945 break;
3946 }
3947 if (prev)
3948 prev->xbfi_next = new;
3949 else
3950 flist->xbf_first = new;
3951 new->xbfi_next = cur;
3952 flist->xbf_count++;
3953}
3954
3955/*
3956 * Compute and fill in the value of the maximum depth of a bmap btree
3957 * in this filesystem. Done once, during mount.
3958 */
3959void
3960xfs_bmap_compute_maxlevels(
3961 xfs_mount_t *mp, /* file system mount structure */
3962 int whichfork) /* data or attr fork */
3963{
3964 int level; /* btree level */
3965 uint maxblocks; /* max blocks at this level */
3966 uint maxleafents; /* max leaf entries possible */
3967 int maxrootrecs; /* max records in root block */
3968 int minleafrecs; /* min records in leaf block */
3969 int minnoderecs; /* min records in node block */
3970 int sz; /* root block size */
3971
3972 /*
3973 * The maximum number of extents in a file, hence the maximum
3974 * number of leaf entries, is controlled by the type of di_nextents
3975 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
3976 * (a signed 16-bit number, xfs_aextnum_t).
3977 */
3978 maxleafents = (whichfork == XFS_DATA_FORK) ? MAXEXTNUM : MAXAEXTNUM;
3979 minleafrecs = mp->m_bmap_dmnr[0];
3980 minnoderecs = mp->m_bmap_dmnr[1];
3981 sz = (whichfork == XFS_DATA_FORK) ?
3982 mp->m_attroffset :
3983 mp->m_sb.sb_inodesize - mp->m_attroffset;
3984 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
3985 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
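	/*
	 * Walk up the tree, reducing the block count by the minimum
	 * node fanout at each level, until everything fits in the
	 * root block held in the inode.
	 */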
3986 for (level = 1; maxblocks > 1; level++) {
3987 if (maxblocks <= maxrootrecs)
3988 maxblocks = 1;
3989 else
3990 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
3991 }
3992 mp->m_bm_maxlevels[whichfork] = level;
3993}
3994
3995/*
3996 * Routine to be called at transaction's end by the xfs_bmapi()/xfs_bunmapi()
3997 * caller.  Frees all the extents that need freeing, which must be done
3998 * last due to locking considerations. We never free any extents in
3999 * the first transaction. This is to allow the caller to make the first
4000 * transaction a synchronous one so that the pointers to the data being
4001 * broken in this transaction will be permanent before the data is actually
4002 * freed. This is necessary to prevent blocks from being reallocated
4003 * and written to before the free and reallocation are actually permanent.
4004 * We do not just make the first transaction synchronous here, because
4005 * there are more efficient ways to gain the same protection in some cases
4006 * (see the file truncation code).
4007 *
4008 * Returns 1 in the "committed" parameter if the given transaction was
4009 * committed and a new one started, and 0 otherwise.
4010 */
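/*
 * Typical caller pattern, sketched with error handling abbreviated
 * (xfs_bmap_add_attrfork() above shows a real instance of the
 * finish/cancel half):
 *
 *	XFS_BMAP_INIT(&flist, &firstblock);
 *	error = xfs_bmapi(tp, ip, bno, len, XFS_BMAPI_WRITE,
 *			  &firstblock, total, mval, &nmap, &flist);
 *	if (!error)
 *		error = xfs_bmap_finish(&tp, &flist, firstblock, &committed);
 *	if (error)
 *		xfs_bmap_cancel(&flist);
 */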
4011/*ARGSUSED*/
4012int /* error */
4013xfs_bmap_finish(
4014 xfs_trans_t **tp, /* transaction pointer addr */
4015 xfs_bmap_free_t *flist, /* i/o: list extents to free */
4016 xfs_fsblock_t firstblock, /* controlled ag for allocs */
4017 int *committed) /* xact committed or not */
4018{
4019 xfs_efd_log_item_t *efd; /* extent free data */
4020 xfs_efi_log_item_t *efi; /* extent free intention */
4021 int error; /* error return value */
4022 xfs_bmap_free_item_t *free; /* free extent list item */
4023 unsigned int logres; /* new log reservation */
4024 unsigned int logcount; /* new log count */
4025 xfs_mount_t *mp; /* filesystem mount structure */
4026 xfs_bmap_free_item_t *next; /* next item on free list */
4027 xfs_trans_t *ntp; /* new transaction pointer */
4028
4029 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
4030 if (flist->xbf_count == 0) {
4031 *committed = 0;
4032 return 0;
4033 }
4034 ntp = *tp;
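	/*
	 * Log an extent-free intent (EFI) covering every extent on the
	 * list, so log recovery can redo the frees if we crash after
	 * the commit below.
	 */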
4035 efi = xfs_trans_get_efi(ntp, flist->xbf_count);
4036 for (free = flist->xbf_first; free; free = free->xbfi_next)
4037 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
4038 free->xbfi_blockcount);
4039 logres = ntp->t_log_res;
4040 logcount = ntp->t_log_count;
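	/*
	 * Duplicate the transaction before committing, so the permanent
	 * log reservation carries over to the new transaction, then
	 * commit the old one.
	 */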
4041 ntp = xfs_trans_dup(*tp);
4042 error = xfs_trans_commit(*tp, 0, NULL);
4043 *tp = ntp;
4044 *committed = 1;
4045 /*
4046 * We have a new transaction, so we should return committed=1,
4047 * even though we're returning an error.
4048 */
4049 if (error) {
4050 return error;
4051 }
4052 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
4053 logcount)))
4054 return error;
4055 efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
4056 for (free = flist->xbf_first; free != NULL; free = next) {
4057 next = free->xbfi_next;
4058 if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
4059 free->xbfi_blockcount))) {
4060 /*
4061 * The bmap free list will be cleaned up at a
4062 * higher level. The EFI will be canceled when
4063 * this transaction is aborted.
4064 * Need to force shutdown here to make sure it
4065 * happens, since this transaction may not be
4066 * dirty yet.
4067 */
4068 mp = ntp->t_mountp;
4069 if (!XFS_FORCED_SHUTDOWN(mp))
4070 xfs_force_shutdown(mp,
4071 (error == EFSCORRUPTED) ?
4072 XFS_CORRUPT_INCORE :
4073 XFS_METADATA_IO_ERROR);
4074 return error;
4075 }
4076 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
4077 free->xbfi_blockcount);
4078 xfs_bmap_del_free(flist, NULL, free);
4079 }
4080 return 0;
4081}
4082
4083/*
4084 * Free up any items left in the list.
4085 */
4086void
4087xfs_bmap_cancel(
4088 xfs_bmap_free_t *flist) /* list of bmap_free_items */
4089{
4090 xfs_bmap_free_item_t *free; /* free list item */
4091 xfs_bmap_free_item_t *next;
4092
4093 if (flist->xbf_count == 0)
4094 return;
4095 ASSERT(flist->xbf_first != NULL);
4096 for (free = flist->xbf_first; free; free = next) {
4097 next = free->xbfi_next;
4098 xfs_bmap_del_free(flist, NULL, free);
4099 }
4100 ASSERT(flist->xbf_count == 0);
4101}
4102
4103/*
4104 * Returns the file-relative block number of the first unused block(s)
4105 * in the file with at least "len" logically contiguous blocks free.
4106 * This is the lowest-address hole if the file has holes, else the first block
4107 * past the end of file.
4108 * Return 0 if the file is currently local (in-inode).
4109 * Returns 0 in *first_unused if the file is currently local (in-inode).
4110int /* error */
4111xfs_bmap_first_unused(
4112 xfs_trans_t *tp, /* transaction pointer */
4113 xfs_inode_t *ip, /* incore inode */
4114 xfs_extlen_t len, /* size of hole to find */
4115 xfs_fileoff_t *first_unused, /* unused block */
4116 int whichfork) /* data or attr fork */
4117{
4118 xfs_bmbt_rec_t *base; /* base of extent array */
4119 xfs_bmbt_rec_t *ep; /* pointer to an extent entry */
4120 int error; /* error return value */
4121 xfs_ifork_t *ifp; /* inode fork pointer */
4122 xfs_fileoff_t lastaddr; /* last block number seen */
4123 xfs_fileoff_t lowest; /* lowest useful block */
4124 xfs_fileoff_t max; /* starting useful block */
4125 xfs_fileoff_t off; /* offset for this block */
4126 xfs_extnum_t nextents; /* number of extent entries */
4127
4128 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
4129 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
4130 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
4131 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4132 *first_unused = 0;
4133 return 0;
4134 }
4135 ifp = XFS_IFORK_PTR(ip, whichfork);
4136 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4137 (error = xfs_iread_extents(tp, ip, whichfork)))
4138 return error;
4139 lowest = *first_unused;
4140 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4141 base = &ifp->if_u1.if_extents[0];
4142 for (lastaddr = 0, max = lowest, ep = base;
4143 ep < &base[nextents];
4144 ep++) {
4145 off = xfs_bmbt_get_startoff(ep);
4146 /*
4147 * See if the hole before this extent will work.
4148 */
4149 if (off >= lowest + len && off - max >= len) {
4150 *first_unused = max;
4151 return 0;
4152 }
4153 lastaddr = off + xfs_bmbt_get_blockcount(ep);
4154 max = XFS_FILEOFF_MAX(lastaddr, lowest);
4155 }
4156 *first_unused = max;
4157 return 0;
4158}
4159
4160/*
4161 * Returns the file-relative block number of the last block + 1 before
4162 * last_block (input value) in the file.
4163 * This is not based on i_size; it is based on the extent list.
4164 * Returns 0 for local files, as they do not have an extent list.
4165 */
4166int /* error */
4167xfs_bmap_last_before(
4168 xfs_trans_t *tp, /* transaction pointer */
4169 xfs_inode_t *ip, /* incore inode */
4170 xfs_fileoff_t *last_block, /* last block */
4171 int whichfork) /* data or attr fork */
4172{
4173 xfs_fileoff_t bno; /* input file offset */
4174 int eof; /* hit end of file */
4175 xfs_bmbt_rec_t *ep; /* pointer to last extent */
4176 int error; /* error return value */
4177 xfs_bmbt_irec_t got; /* current extent value */
4178 xfs_ifork_t *ifp; /* inode fork pointer */
4179 xfs_extnum_t lastx; /* last extent used */
4180 xfs_bmbt_irec_t prev; /* previous extent value */
4181
4182 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4183 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4184 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
4185 return XFS_ERROR(EIO);
4186 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4187 *last_block = 0;
4188 return 0;
4189 }
4190 ifp = XFS_IFORK_PTR(ip, whichfork);
4191 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4192 (error = xfs_iread_extents(tp, ip, whichfork)))
4193 return error;
4194 bno = *last_block - 1;
4195 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
4196 &prev);
4197 if (eof || xfs_bmbt_get_startoff(ep) > bno) {
4198 if (prev.br_startoff == NULLFILEOFF)
4199 *last_block = 0;
4200 else
4201 *last_block = prev.br_startoff + prev.br_blockcount;
4202 }
4203 /*
4204 * Otherwise *last_block is already the right answer.
4205 */
4206 return 0;
4207}
4208
4209/*
4210 * Returns the file-relative block number of the first block past eof in
4211 * the file.  This is not based on i_size; it is based on the extent list.
4212 * Returns 0 for local files, as they do not have an extent list.
4213 */
4214int /* error */
4215xfs_bmap_last_offset(
4216 xfs_trans_t *tp, /* transaction pointer */
4217 xfs_inode_t *ip, /* incore inode */
4218 xfs_fileoff_t *last_block, /* last block */
4219 int whichfork) /* data or attr fork */
4220{
4221 xfs_bmbt_rec_t *base; /* base of extent array */
4222 xfs_bmbt_rec_t *ep; /* pointer to last extent */
4223 int error; /* error return value */
4224 xfs_ifork_t *ifp; /* inode fork pointer */
4225 xfs_extnum_t nextents; /* number of extent entries */
4226
4227 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4228 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4229 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
4230 return XFS_ERROR(EIO);
4231 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4232 *last_block = 0;
4233 return 0;
4234 }
4235 ifp = XFS_IFORK_PTR(ip, whichfork);
4236 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4237 (error = xfs_iread_extents(tp, ip, whichfork)))
4238 return error;
4239 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4240 if (!nextents) {
4241 *last_block = 0;
4242 return 0;
4243 }
4244 base = &ifp->if_u1.if_extents[0];
4245 ASSERT(base != NULL);
4246 ep = &base[nextents - 1];
4247 *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
4248 return 0;
4249}
4250
4251/*
4252 * Returns whether the selected fork of the inode has exactly one
4253 * block or not. For the data fork we check this matches di_size,
4254 * implying the file's range is 0..bsize-1.
4255 */
4256int /* 1=>1 block, 0=>otherwise */
4257xfs_bmap_one_block(
4258 xfs_inode_t *ip, /* incore inode */
4259 int whichfork) /* data or attr fork */
4260{
4261 xfs_bmbt_rec_t *ep; /* ptr to fork's extent */
4262 xfs_ifork_t *ifp; /* inode fork pointer */
4263 int rval; /* return value */
4264 xfs_bmbt_irec_t s; /* internal version of extent */
4265
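	/*
	 * In non-DEBUG builds the data-fork answer comes straight from
	 * di_size; the extent check below then runs only for the attr
	 * fork.  DEBUG builds run it for both and cross-check di_size.
	 */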
4266#ifndef DEBUG
4267 if (whichfork == XFS_DATA_FORK)
4268 return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
4269#endif /* !DEBUG */
4270 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
4271 return 0;
4272 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
4273 return 0;
4274 ifp = XFS_IFORK_PTR(ip, whichfork);
4275 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
4276 ep = ifp->if_u1.if_extents;
4277 xfs_bmbt_get_all(ep, &s);
4278 rval = s.br_startoff == 0 && s.br_blockcount == 1;
4279 if (rval && whichfork == XFS_DATA_FORK)
4280 ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4281 return rval;
4282}
4283
4284/*
4285 * Read in the extents to if_extents.
4286 * All inode fields are set up by the caller; we just traverse the btree
4287 * and copy the records in. If the file system cannot contain unwritten
4288 * extents, the records are checked for no "state" flags.
4289 */
4290int /* error */
4291xfs_bmap_read_extents(
4292 xfs_trans_t *tp, /* transaction pointer */
4293 xfs_inode_t *ip, /* incore inode */
4294 int whichfork) /* data or attr fork */
4295{
4296 xfs_bmbt_block_t *block; /* current btree block */
4297 xfs_fsblock_t bno; /* block # of "block" */
4298 xfs_buf_t *bp; /* buffer for "block" */
4299 int error; /* error return value */
4300 xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
4301#ifdef XFS_BMAP_TRACE
4302 static char fname[] = "xfs_bmap_read_extents";
4303#endif
4304 xfs_extnum_t i, j; /* index into the extents list */
4305 xfs_ifork_t *ifp; /* fork structure */
4306 int level; /* btree level, for checking */
4307 xfs_mount_t *mp; /* file system mount structure */
4308 xfs_bmbt_ptr_t *pp; /* pointer to block address */
4309 /* REFERENCED */
4310 xfs_extnum_t room; /* number of entries there's room for */
4311 xfs_bmbt_rec_t *trp; /* target record pointer */
4312
4313 bno = NULLFSBLOCK;
4314 mp = ip->i_mount;
4315 ifp = XFS_IFORK_PTR(ip, whichfork);
4316 exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
4317 XFS_EXTFMT_INODE(ip);
4318 block = ifp->if_broot;
4319 /*
4320	 * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
4321 */
4322 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
4323 level = INT_GET(block->bb_level, ARCH_CONVERT);
4324 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
4325 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
4326 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
4327 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
4328 bno = INT_GET(*pp, ARCH_CONVERT);
4329 /*
4330 * Go down the tree until leaf level is reached, following the first
4331 * pointer (leftmost) at each level.
4332 */
4333 while (level-- > 0) {
4334 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4335 XFS_BMAP_BTREE_REF)))
4336 return error;
4337 block = XFS_BUF_TO_BMBT_BLOCK(bp);
4338 XFS_WANT_CORRUPTED_GOTO(
4339 XFS_BMAP_SANITY_CHECK(mp, block, level),
4340 error0);
4341 if (level == 0)
4342 break;
4343 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
4344 1, mp->m_bmap_dmxr[1]);
4345 XFS_WANT_CORRUPTED_GOTO(
4346 XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)),
4347 error0);
4348 bno = INT_GET(*pp, ARCH_CONVERT);
4349 xfs_trans_brelse(tp, bp);
4350 }
4351 /*
4352 * Here with bp and block set to the leftmost leaf node in the tree.
4353 */
4354 room = ifp->if_bytes / (uint)sizeof(*trp);
4355 trp = ifp->if_u1.if_extents;
4356 i = 0;
4357 /*
4358 * Loop over all leaf nodes. Copy information to the extent list.
4359 */
4360 for (;;) {
4361 xfs_bmbt_rec_t *frp, *temp;
4362 xfs_fsblock_t nextbno;
4363 xfs_extnum_t num_recs;
4364
4365
4366 num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
4367 if (unlikely(i + num_recs > room)) {
4368 ASSERT(i + num_recs <= room);
4369 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
4370 "corrupt dinode %Lu, (btree extents). Unmount and run xfs_repair.",
4371 (unsigned long long) ip->i_ino);
4372 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)",
4373 XFS_ERRLEVEL_LOW,
4374 ip->i_mount);
4375 goto error0;
4376 }
4377 XFS_WANT_CORRUPTED_GOTO(
4378 XFS_BMAP_SANITY_CHECK(mp, block, 0),
4379 error0);
4380 /*
4381 * Read-ahead the next leaf block, if any.
4382 */
4383 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
4384 if (nextbno != NULLFSBLOCK)
4385 xfs_btree_reada_bufl(mp, nextbno, 1);
4386 /*
4387 * Copy records into the extent list.
4388 */
4389 frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
4390 block, 1, mp->m_bmap_dmxr[0]);
4391 temp = trp;
4392 for (j = 0; j < num_recs; j++, frp++, trp++) {
4393 trp->l0 = INT_GET(frp->l0, ARCH_CONVERT);
4394 trp->l1 = INT_GET(frp->l1, ARCH_CONVERT);
4395 }
4396 if (exntf == XFS_EXTFMT_NOSTATE) {
4397 /*
4398 * Check all attribute bmap btree records and
4399 * any "older" data bmap btree records for a
4400 * set bit in the "extent flag" position.
4401 */
4402 if (unlikely(xfs_check_nostate_extents(temp, num_recs))) {
4403 XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
4404 XFS_ERRLEVEL_LOW,
4405 ip->i_mount);
4406 goto error0;
4407 }
4408 }
4409 i += num_recs;
4410 xfs_trans_brelse(tp, bp);
4411 bno = nextbno;
4412 /*
4413 * If we've reached the end, stop.
4414 */
4415 if (bno == NULLFSBLOCK)
4416 break;
4417 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4418 XFS_BMAP_BTREE_REF)))
4419 return error;
4420 block = XFS_BUF_TO_BMBT_BLOCK(bp);
4421 }
4422 ASSERT(i == ifp->if_bytes / (uint)sizeof(*trp));
4423 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
4424 xfs_bmap_trace_exlist(fname, ip, i, whichfork);
4425 return 0;
4426error0:
4427 xfs_trans_brelse(tp, bp);
4428 return XFS_ERROR(EFSCORRUPTED);
4429}
4430
4431#ifdef XFS_BMAP_TRACE
4432/*
4433 * Add bmap trace insert entries for all the contents of the extent list.
4434 */
4435void
4436xfs_bmap_trace_exlist(
4437 char *fname, /* function name */
4438 xfs_inode_t *ip, /* incore inode pointer */
4439 xfs_extnum_t cnt, /* count of entries in the list */
4440 int whichfork) /* data or attr fork */
4441{
4442 xfs_bmbt_rec_t *base; /* base of extent list */
4443 xfs_bmbt_rec_t *ep; /* current entry in extent list */
4444 xfs_extnum_t idx; /* extent list entry number */
4445 xfs_ifork_t *ifp; /* inode fork pointer */
4446 xfs_bmbt_irec_t s; /* extent list record */
4447
4448 ifp = XFS_IFORK_PTR(ip, whichfork);
4449 ASSERT(cnt == ifp->if_bytes / (uint)sizeof(*base));
4450 base = ifp->if_u1.if_extents;
4451 for (idx = 0, ep = base; idx < cnt; idx++, ep++) {
4452 xfs_bmbt_get_all(ep, &s);
4453 xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL,
4454 whichfork);
4455 }
4456}
4457#endif
4458
4459#ifdef DEBUG
4460/*
4461 * Validate that the bmbt_irecs being returned from bmapi are valid
4462 * given the caller's original parameters.  Specifically check the
4463 * ranges of the returned irecs to ensure that they only extend beyond
4464 * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
4465 */
4466STATIC void
4467xfs_bmap_validate_ret(
4468 xfs_fileoff_t bno,
4469 xfs_filblks_t len,
4470 int flags,
4471 xfs_bmbt_irec_t *mval,
4472 int nmap,
4473 int ret_nmap)
4474{
4475 int i; /* index to map values */
4476
4477 ASSERT(ret_nmap <= nmap);
4478
4479 for (i = 0; i < ret_nmap; i++) {
4480 ASSERT(mval[i].br_blockcount > 0);
4481 if (!(flags & XFS_BMAPI_ENTIRE)) {
4482 ASSERT(mval[i].br_startoff >= bno);
4483 ASSERT(mval[i].br_blockcount <= len);
4484 ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
4485 bno + len);
4486 } else {
4487 ASSERT(mval[i].br_startoff < bno + len);
4488 ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
4489 bno);
4490 }
4491 ASSERT(i == 0 ||
4492 mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
4493 mval[i].br_startoff);
4494 if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
4495 ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
4496 mval[i].br_startblock != HOLESTARTBLOCK);
4497 ASSERT(mval[i].br_state == XFS_EXT_NORM ||
4498 mval[i].br_state == XFS_EXT_UNWRITTEN);
4499 }
4500}
4501#endif /* DEBUG */
4502
4503
4504/*
4505 * Map file blocks to filesystem blocks.
4506 * File range is given by the bno/len pair.
4507 * Adds blocks to the file if this is a write ("flags & XFS_BMAPI_WRITE"
4508 * set) into a hole or past eof.
4509 * Only allocates blocks from a single allocation group,
4510 * to avoid locking problems.
4511 * The returned value in "firstblock" from the first call in a transaction
4512 * must be remembered and presented to subsequent calls in "firstblock".
4513 * An upper bound for the number of blocks to be allocated is supplied to
4514 * the first call in "total"; if no allocation group has that many free
4515 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
4516 */
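/*
 * A minimal read-only mapping call might look like this (a sketch
 * with hypothetical locals; no transaction, no allocation):
 *
 *	xfs_bmbt_irec_t	imap;
 *	int		nimaps = 1;
 *
 *	error = xfs_bmapi(NULL, ip, offset_fsb, count_fsb, 0, NULL, 0,
 *			  &imap, &nimaps, NULL);
 */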
4517int /* error */
4518xfs_bmapi(
4519 xfs_trans_t *tp, /* transaction pointer */
4520 xfs_inode_t *ip, /* incore inode */
4521 xfs_fileoff_t bno, /* starting file offs. mapped */
4522 xfs_filblks_t len, /* length to map in file */
4523 int flags, /* XFS_BMAPI_... */
4524 xfs_fsblock_t *firstblock, /* first allocated block
4525 controls a.g. for allocs */
4526 xfs_extlen_t total, /* total blocks needed */
4527 xfs_bmbt_irec_t *mval, /* output: map values */
4528 int *nmap, /* i/o: mval size/count */
4529 xfs_bmap_free_t *flist) /* i/o: list extents to free */
4530{
4531 xfs_fsblock_t abno; /* allocated block number */
4532 xfs_extlen_t alen; /* allocated extent length */
4533 xfs_fileoff_t aoff; /* allocated file offset */
4534 xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */
4535 char contig; /* allocation must be one extent */
4536 xfs_btree_cur_t *cur; /* bmap btree cursor */
4537 char delay; /* this request is for delayed alloc */
4538 xfs_fileoff_t end; /* end of mapped file region */
4539 int eof; /* we've hit the end of extent list */
4540 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
4541 int error; /* error return */
4542 char exact; /* don't do all of wasdelayed extent */
4543 xfs_bmbt_irec_t got; /* current extent list record */
4544 xfs_ifork_t *ifp; /* inode fork pointer */
4545 xfs_extlen_t indlen; /* indirect blocks length */
4546 char inhole; /* current location is hole in file */
4547 xfs_extnum_t lastx; /* last useful extent number */
4548 int logflags; /* flags for transaction logging */
4549 xfs_extlen_t minleft; /* min blocks left after allocation */
4550 xfs_extlen_t minlen; /* min allocation size */
4551 xfs_mount_t *mp; /* xfs mount structure */
4552 int n; /* current extent index */
4553	int		nallocs;	/* number of extents alloc'd */
4554 xfs_extnum_t nextents; /* number of extents in file */
4555 xfs_fileoff_t obno; /* old block number (offset) */
4556 xfs_bmbt_irec_t prev; /* previous extent list record */
4557 char stateless; /* ignore state flag set */
4558 int tmp_logflags; /* temp flags holder */
4559 char trim; /* output trimmed to match range */
4560 char userdata; /* allocating non-metadata */
4561 char wasdelay; /* old extent was delayed */
4562 int whichfork; /* data or attr fork */
4563 char wr; /* this is a write request */
4564 char rsvd; /* OK to allocate reserved blocks */
4565#ifdef DEBUG
4566 xfs_fileoff_t orig_bno; /* original block number value */
4567 int orig_flags; /* original flags arg value */
4568 xfs_filblks_t orig_len; /* original value of len arg */
4569 xfs_bmbt_irec_t *orig_mval; /* original value of mval */
4570 int orig_nmap; /* original value of *nmap */
4571
4572 orig_bno = bno;
4573 orig_len = len;
4574 orig_flags = flags;
4575 orig_mval = mval;
4576 orig_nmap = *nmap;
4577#endif
4578 ASSERT(*nmap >= 1);
4579 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
4580 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4581 XFS_ATTR_FORK : XFS_DATA_FORK;
4582 mp = ip->i_mount;
4583 if (unlikely(XFS_TEST_ERROR(
4584 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4585 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4586 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
4587 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4588 XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp);
4589 return XFS_ERROR(EFSCORRUPTED);
4590 }
4591 if (XFS_FORCED_SHUTDOWN(mp))
4592 return XFS_ERROR(EIO);
4593 ifp = XFS_IFORK_PTR(ip, whichfork);
4594 ASSERT(ifp->if_ext_max ==
4595 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4596 if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
4597 XFS_STATS_INC(xs_blk_mapw);
4598 else
4599 XFS_STATS_INC(xs_blk_mapr);
4600 delay = (flags & XFS_BMAPI_DELAY) != 0;
4601 trim = (flags & XFS_BMAPI_ENTIRE) == 0;
4602 userdata = (flags & XFS_BMAPI_METADATA) == 0;
4603 exact = (flags & XFS_BMAPI_EXACT) != 0;
4604 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
4605 contig = (flags & XFS_BMAPI_CONTIG) != 0;
4606 /*
4607 * stateless is used to combine extents which
4608 * differ only due to the state of the extents.
4609 * This technique is used from xfs_getbmap()
4610 * when the caller does not wish to see the
4611 * separation (which is the default).
4612 *
4613 * This technique is also used when writing a
4614 * buffer which has been partially written,
4615 * (usually by being flushed during a chunkread),
4616 * to ensure one write takes place. This also
4617 * prevents a change in the xfs inode extents at
4618 * this time, intentionally. This change occurs
4619 * on completion of the write operation, in
4620 * xfs_strat_comp(), where the xfs_bmapi() call
4621 * is transactioned, and the extents combined.
4622 */
4623 stateless = (flags & XFS_BMAPI_IGSTATE) != 0;
4624 if (stateless && wr) /* if writing unwritten space, no */
4625 wr = 0; /* allocations are allowed */
4626 ASSERT(wr || !delay);
4627 logflags = 0;
4628 nallocs = 0;
4629 cur = NULL;
4630 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4631 ASSERT(wr && tp);
4632 if ((error = xfs_bmap_local_to_extents(tp, ip,
4633 firstblock, total, &logflags, whichfork)))
4634 goto error0;
4635 }
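	/*
	 * On the first allocation in a write transaction, figure out
	 * how many blocks must remain free in the AG so a subsequent
	 * bmap btree split cannot run out of space (roughly one block
	 * per level of the tree).
	 */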
4636 if (wr && *firstblock == NULLFSBLOCK) {
4637 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4638 minleft = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
4639 else
4640 minleft = 1;
4641 } else
4642 minleft = 0;
4643 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4644 (error = xfs_iread_extents(tp, ip, whichfork)))
4645 goto error0;
4646 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
4647 &prev);
4648 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4649 n = 0;
4650 end = bno + len;
4651 obno = bno;
4652 bma.ip = NULL;
4653 while (bno < end && n < *nmap) {
4654 /*
4655 * Reading past eof, act as though there's a hole
4656 * up to end.
4657 */
4658 if (eof && !wr)
4659 got.br_startoff = end;
4660 inhole = eof || got.br_startoff > bno;
4661 wasdelay = wr && !inhole && !delay &&
4662 ISNULLSTARTBLOCK(got.br_startblock);
4663 /*
4664 * First, deal with the hole before the allocated space
4665 * that we found, if any.
4666 */
4667 if (wr && (inhole || wasdelay)) {
4668 /*
4669 * For the wasdelay case, we could also just
4670 * allocate the stuff asked for in this bmap call
4671 * but that wouldn't be as good.
4672 */
4673 if (wasdelay && !exact) {
4674 alen = (xfs_extlen_t)got.br_blockcount;
4675 aoff = got.br_startoff;
4676 if (lastx != NULLEXTNUM && lastx) {
4677 ep = &ifp->if_u1.if_extents[lastx - 1];
4678 xfs_bmbt_get_all(ep, &prev);
4679 }
4680 } else if (wasdelay) {
4681 alen = (xfs_extlen_t)
4682 XFS_FILBLKS_MIN(len,
4683 (got.br_startoff +
4684 got.br_blockcount) - bno);
4685 aoff = bno;
4686 } else {
4687 alen = (xfs_extlen_t)
4688 XFS_FILBLKS_MIN(len, MAXEXTLEN);
4689 if (!eof)
4690 alen = (xfs_extlen_t)
4691 XFS_FILBLKS_MIN(alen,
4692 got.br_startoff - bno);
4693 aoff = bno;
4694 }
4695 minlen = contig ? alen : 1;
4696 if (delay) {
4697 indlen = (xfs_extlen_t)
4698 xfs_bmap_worst_indlen(ip, alen);
4699 ASSERT(indlen > 0);
4700 /*
4701 * Make a transaction-less quota reservation for
4702 * delayed allocation blocks. This number gets
4703 * adjusted later.
4704 * We return EDQUOT if we haven't allocated
4705				 * any blocks already inside this loop.
4706 */
4707 if (XFS_TRANS_RESERVE_BLKQUOTA(
4708 mp, NULL, ip, (long)alen)) {
4709 if (n == 0) {
4710 *nmap = 0;
4711 ASSERT(cur == NULL);
4712 return XFS_ERROR(EDQUOT);
4713 }
4714 break;
4715 }
4716
4717 /*
4718				 * Update the superblock counters for alen and
4719				 * indlen separately; they may come from different pools.
4720 */
4721 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
4722 xfs_extlen_t extsz;
4723 xfs_extlen_t ralen;
4724 if (!(extsz = ip->i_d.di_extsize))
4725 extsz = mp->m_sb.sb_rextsize;
4726 ralen = roundup(alen, extsz);
4727 ralen = ralen / mp->m_sb.sb_rextsize;
4728 if (xfs_mod_incore_sb(mp,
4729 XFS_SBS_FREXTENTS,
4730 -(ralen), rsvd)) {
4731 if (XFS_IS_QUOTA_ON(ip->i_mount))
4732 XFS_TRANS_UNRESERVE_BLKQUOTA(
4733 mp, NULL, ip,
4734 (long)alen);
4735 break;
4736 }
4737 } else {
4738 if (xfs_mod_incore_sb(mp,
4739 XFS_SBS_FDBLOCKS,
4740 -(alen), rsvd)) {
4741 if (XFS_IS_QUOTA_ON(ip->i_mount))
4742 XFS_TRANS_UNRESERVE_BLKQUOTA(
4743 mp, NULL, ip,
4744 (long)alen);
4745 break;
4746 }
4747 }
4748
4749 if (xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
4750 -(indlen), rsvd)) {
4751 XFS_TRANS_UNRESERVE_BLKQUOTA(
4752 mp, NULL, ip, (long)alen);
4753 break;
4754 }
4755 ip->i_delayed_blks += alen;
4756 abno = NULLSTARTBLOCK(indlen);
4757 } else {
4758 /*
4759 * If first time, allocate and fill in
4760 * once-only bma fields.
4761 */
4762 if (bma.ip == NULL) {
4763 bma.tp = tp;
4764 bma.ip = ip;
4765 bma.prevp = &prev;
4766 bma.gotp = &got;
4767 bma.total = total;
4768 bma.userdata = 0;
4769 }
4770 /* Indicate if this is the first user data
4771 * in the file, or just any user data.
4772 */
4773 if (userdata) {
4774 bma.userdata = (aoff == 0) ?
4775 XFS_ALLOC_INITIAL_USER_DATA :
4776 XFS_ALLOC_USERDATA;
4777 }
4778 /*
4779 * Fill in changeable bma fields.
4780 */
4781 bma.eof = eof;
4782 bma.firstblock = *firstblock;
4783 bma.alen = alen;
4784 bma.off = aoff;
4785 bma.wasdel = wasdelay;
4786 bma.minlen = minlen;
4787 bma.low = flist->xbf_low;
4788 bma.minleft = minleft;
4789 /*
4790 * Only want to do the alignment at the
4791 * eof if it is userdata and allocation length
4792 * is larger than a stripe unit.
4793 */
4794 if (mp->m_dalign && alen >= mp->m_dalign &&
4795 userdata && whichfork == XFS_DATA_FORK) {
4796 if ((error = xfs_bmap_isaeof(ip, aoff,
4797 whichfork, &bma.aeof)))
4798 goto error0;
4799 } else
4800 bma.aeof = 0;
4801 /*
4802 * Call allocator.
4803 */
4804 if ((error = xfs_bmap_alloc(&bma)))
4805 goto error0;
4806 /*
4807 * Copy out result fields.
4808 */
4809 abno = bma.rval;
4810 if ((flist->xbf_low = bma.low))
4811 minleft = 0;
4812 alen = bma.alen;
4813 aoff = bma.off;
4814 ASSERT(*firstblock == NULLFSBLOCK ||
4815 XFS_FSB_TO_AGNO(mp, *firstblock) ==
4816 XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
4817 (flist->xbf_low &&
4818 XFS_FSB_TO_AGNO(mp, *firstblock) <
4819 XFS_FSB_TO_AGNO(mp, bma.firstblock)));
4820 *firstblock = bma.firstblock;
4821 if (cur)
4822 cur->bc_private.b.firstblock =
4823 *firstblock;
4824 if (abno == NULLFSBLOCK)
4825 break;
4826 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
4827 cur = xfs_btree_init_cursor(mp,
4828 tp, NULL, 0, XFS_BTNUM_BMAP,
4829 ip, whichfork);
4830 cur->bc_private.b.firstblock =
4831 *firstblock;
4832 cur->bc_private.b.flist = flist;
4833 }
4834 /*
4835 * Bump the number of extents we've allocated
4836 * in this call.
4837 */
4838 nallocs++;
4839 }
4840 if (cur)
4841 cur->bc_private.b.flags =
4842 wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
4843 got.br_startoff = aoff;
4844 got.br_startblock = abno;
4845 got.br_blockcount = alen;
4846 got.br_state = XFS_EXT_NORM; /* assume normal */
4847 /*
4848 * Determine state of extent, and the filesystem.
4849 * A wasdelay extent has been initialized, so
4850 * shouldn't be flagged as unwritten.
4851 */
4852 if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4853 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4854 got.br_state = XFS_EXT_UNWRITTEN;
4855 }
4856 error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
4857 firstblock, flist, &tmp_logflags, whichfork,
4858 rsvd);
4859 logflags |= tmp_logflags;
4860 if (error)
4861 goto error0;
4862 lastx = ifp->if_lastex;
4863 ep = &ifp->if_u1.if_extents[lastx];
4864 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4865 xfs_bmbt_get_all(ep, &got);
4866 ASSERT(got.br_startoff <= aoff);
4867 ASSERT(got.br_startoff + got.br_blockcount >=
4868 aoff + alen);
4869#ifdef DEBUG
4870 if (delay) {
4871 ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
4872 ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
4873 }
4874 ASSERT(got.br_state == XFS_EXT_NORM ||
4875 got.br_state == XFS_EXT_UNWRITTEN);
4876#endif
4877 /*
4878 * Fall down into the found allocated space case.
4879 */
4880 } else if (inhole) {
4881 /*
4882 * Reading in a hole.
4883 */
4884 mval->br_startoff = bno;
4885 mval->br_startblock = HOLESTARTBLOCK;
4886 mval->br_blockcount =
4887 XFS_FILBLKS_MIN(len, got.br_startoff - bno);
4888 mval->br_state = XFS_EXT_NORM;
4889 bno += mval->br_blockcount;
4890 len -= mval->br_blockcount;
4891 mval++;
4892 n++;
4893 continue;
4894 }
4895 /*
4896 * Then deal with the allocated space we found.
4897 */
4898 ASSERT(ep != NULL);
4899 if (trim && (got.br_startoff + got.br_blockcount > obno)) {
4900 if (obno > bno)
4901 bno = obno;
4902 ASSERT((bno >= obno) || (n == 0));
4903 ASSERT(bno < end);
4904 mval->br_startoff = bno;
4905 if (ISNULLSTARTBLOCK(got.br_startblock)) {
4906 ASSERT(!wr || delay);
4907 mval->br_startblock = DELAYSTARTBLOCK;
4908 } else
4909 mval->br_startblock =
4910 got.br_startblock +
4911 (bno - got.br_startoff);
4912 /*
4913 * Return the minimum of what we got and what we
4914			 * asked for as the length. We can use the len
4915 * variable here because it is modified below
4916 * and we could have been there before coming
4917 * here if the first part of the allocation
4918 * didn't overlap what was asked for.
4919 */
4920 mval->br_blockcount =
4921 XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
4922 (bno - got.br_startoff));
4923 mval->br_state = got.br_state;
4924 ASSERT(mval->br_blockcount <= len);
4925 } else {
4926 *mval = got;
4927 if (ISNULLSTARTBLOCK(mval->br_startblock)) {
4928 ASSERT(!wr || delay);
4929 mval->br_startblock = DELAYSTARTBLOCK;
4930 }
4931 }
4932
4933 /*
4934 * Check if writing previously allocated but
4935 * unwritten extents.
4936 */
4937 if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
4938 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
4939 /*
4940 * Modify (by adding) the state flag, if writing.
4941 */
4942 ASSERT(mval->br_blockcount <= len);
4943 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
4944 cur = xfs_btree_init_cursor(mp,
4945 tp, NULL, 0, XFS_BTNUM_BMAP,
4946 ip, whichfork);
4947 cur->bc_private.b.firstblock =
4948 *firstblock;
4949 cur->bc_private.b.flist = flist;
4950 }
4951 mval->br_state = XFS_EXT_NORM;
4952 error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
4953 firstblock, flist, &tmp_logflags, whichfork,
4954 rsvd);
4955 logflags |= tmp_logflags;
4956 if (error)
4957 goto error0;
4958 lastx = ifp->if_lastex;
4959 ep = &ifp->if_u1.if_extents[lastx];
4960 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4961 xfs_bmbt_get_all(ep, &got);
4962 /*
4963 * We may have combined previously unwritten
4964 * space with written space, so generate
4965 * another request.
4966 */
4967 if (mval->br_blockcount < len)
4968 continue;
4969 }
4970
4971 ASSERT(!trim ||
4972 ((mval->br_startoff + mval->br_blockcount) <= end));
4973 ASSERT(!trim || (mval->br_blockcount <= len) ||
4974 (mval->br_startoff < obno));
4975 bno = mval->br_startoff + mval->br_blockcount;
4976 len = end - bno;
4977 if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
4978 ASSERT(mval->br_startblock == mval[-1].br_startblock);
4979 ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
4980 ASSERT(mval->br_state == mval[-1].br_state);
4981 mval[-1].br_blockcount = mval->br_blockcount;
4982 mval[-1].br_state = mval->br_state;
4983 } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
4984 mval[-1].br_startblock != DELAYSTARTBLOCK &&
4985 mval[-1].br_startblock != HOLESTARTBLOCK &&
4986 mval->br_startblock ==
4987 mval[-1].br_startblock + mval[-1].br_blockcount &&
4988 (stateless || mval[-1].br_state == mval->br_state)) {
4989 ASSERT(mval->br_startoff ==
4990 mval[-1].br_startoff + mval[-1].br_blockcount);
4991 mval[-1].br_blockcount += mval->br_blockcount;
4992 } else if (n > 0 &&
4993 mval->br_startblock == DELAYSTARTBLOCK &&
4994 mval[-1].br_startblock == DELAYSTARTBLOCK &&
4995 mval->br_startoff ==
4996 mval[-1].br_startoff + mval[-1].br_blockcount) {
4997 mval[-1].br_blockcount += mval->br_blockcount;
4998 mval[-1].br_state = mval->br_state;
4999 } else if (!((n == 0) &&
5000 ((mval->br_startoff + mval->br_blockcount) <=
5001 obno))) {
5002 mval++;
5003 n++;
5004 }
5005 /*
5006 * If we're done, stop now. Stop when we've allocated
5007 * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
5008 * the transaction may get too big.
5009 */
5010 if (bno >= end || n >= *nmap || nallocs >= *nmap)
5011 break;
5012 /*
5013 * Else go on to the next record.
5014 */
5015 ep++;
5016 lastx++;
5017 if (lastx >= nextents) {
5018 eof = 1;
5019 prev = got;
5020 } else
5021 xfs_bmbt_get_all(ep, &got);
5022 }
5023 ifp->if_lastex = lastx;
5024 *nmap = n;
5025 /*
5026 * Transform from btree to extents, give it cur.
5027 */
5028 if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
5029 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5030 ASSERT(wr && cur);
5031 error = xfs_bmap_btree_to_extents(tp, ip, cur,
5032 &tmp_logflags, whichfork);
5033 logflags |= tmp_logflags;
5034 if (error)
5035 goto error0;
5036 }
5037 ASSERT(ifp->if_ext_max ==
5038 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5039 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
5040 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
5041 error = 0;
5042
5043error0:
5044 /*
5045 * Log everything. Do this after conversion, there's no point in
5046 * logging the extent list if we've converted to btree format.
5047 */
5048 if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
5049 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5050 logflags &= ~XFS_ILOG_FEXT(whichfork);
5051 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
5052 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5053 logflags &= ~XFS_ILOG_FBROOT(whichfork);
5054 /*
5055 * Log whatever the flags say, even if error. Otherwise we might miss
5056 * detecting a case where the data is changed, there's an error,
5057 * and it's not logged so we don't shutdown when we should.
5058 */
5059 if (logflags) {
5060 ASSERT(tp && wr);
5061 xfs_trans_log_inode(tp, ip, logflags);
5062 }
5063 if (cur) {
5064 if (!error) {
5065 ASSERT(*firstblock == NULLFSBLOCK ||
5066 XFS_FSB_TO_AGNO(mp, *firstblock) ==
5067 XFS_FSB_TO_AGNO(mp,
5068 cur->bc_private.b.firstblock) ||
5069 (flist->xbf_low &&
5070 XFS_FSB_TO_AGNO(mp, *firstblock) <
5071 XFS_FSB_TO_AGNO(mp,
5072 cur->bc_private.b.firstblock)));
5073 *firstblock = cur->bc_private.b.firstblock;
5074 }
5075 xfs_btree_del_cursor(cur,
5076 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5077 }
5078 if (!error)
5079 xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
5080 orig_nmap, *nmap);
5081 return error;
5082}
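
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): a read-only lookup through xfs_bmapi(), following the calling
 * convention used by xfs_getbmap() below. With a null transaction, no
 * XFS_BMAPI_WRITE flag, and null firstblock/flist arguments, xfs_bmapi()
 * only reports existing mappings: holes come back as HOLESTARTBLOCK and
 * delayed allocations as DELAYSTARTBLOCK. Assumes the caller already
 * holds the inode lock.
 */
STATIC int
example_read_mapping(
	xfs_inode_t	*ip,		/* locked incore inode */
	xfs_fileoff_t	offset_fsb,	/* file offset, in fsblocks */
	xfs_filblks_t	len_fsb)	/* length to map, in fsblocks */
{
	xfs_bmbt_irec_t	map[XFS_BMAP_MAX_NMAP];	/* mapping results */
	int		nmap;		/* in: map size, out: count */
	int		error;		/* error return value */
	int		i;		/* result index */

	nmap = XFS_BMAP_MAX_NMAP;
	error = xfs_bmapi(NULL, ip, offset_fsb, len_fsb, 0, NULL, 0,
			map, &nmap, NULL);
	if (error)
		return error;
	for (i = 0; i < nmap; i++) {
		if (map[i].br_startblock == HOLESTARTBLOCK)
			continue;	/* hole: no blocks behind this range */
		if (map[i].br_startblock == DELAYSTARTBLOCK)
			continue;	/* delalloc: no real block yet */
		/* map[i].br_startblock names a real fsblock here */
	}
	return 0;
}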
5083
5084/*
5085 * Map file blocks to filesystem blocks, simple version.
5086 * One block (extent) only, read-only.
5087 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
5088 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
5089 * was set and all the others were clear.
5090 */
5091int /* error */
5092xfs_bmapi_single(
5093 xfs_trans_t *tp, /* transaction pointer */
5094 xfs_inode_t *ip, /* incore inode */
5095 int whichfork, /* data or attr fork */
5096 xfs_fsblock_t *fsb, /* output: mapped block */
5097 xfs_fileoff_t bno) /* starting file offs. mapped */
5098{
5099 int eof; /* we've hit the end of extent list */
5100 int error; /* error return */
5101 xfs_bmbt_irec_t got; /* current extent list record */
5102 xfs_ifork_t *ifp; /* inode fork pointer */
5103 xfs_extnum_t lastx; /* last useful extent number */
5104 xfs_bmbt_irec_t prev; /* previous extent list record */
5105
5106 ifp = XFS_IFORK_PTR(ip, whichfork);
5107 if (unlikely(
5108 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
5109 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
5110 XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
5111 ip->i_mount);
5112 return XFS_ERROR(EFSCORRUPTED);
5113 }
5114 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
5115 return XFS_ERROR(EIO);
5116 XFS_STATS_INC(xs_blk_mapr);
5117 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5118 (error = xfs_iread_extents(tp, ip, whichfork)))
5119 return error;
5120 (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
5121 &prev);
5122 /*
5123 * Reading past eof, act as though there's a hole
5124 * up to end.
5125 */
5126 if (eof || got.br_startoff > bno) {
5127 *fsb = NULLFSBLOCK;
5128 return 0;
5129 }
5130 ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
5131 ASSERT(bno < got.br_startoff + got.br_blockcount);
5132 *fsb = got.br_startblock + (bno - got.br_startoff);
5133 ifp->if_lastex = lastx;
5134 return 0;
5135}
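
/*
 * Illustrative sketch (editorial addition): typical use of
 * xfs_bmapi_single() to translate one file-relative block into a
 * filesystem block. NULLFSBLOCK in *fsb means the offset falls in a
 * hole (or past eof); the transaction pointer may be null for a pure
 * lookup, as in the function above.
 */
STATIC int
example_map_one_block(
	xfs_trans_t	*tp,	/* transaction pointer, may be null */
	xfs_inode_t	*ip,	/* locked incore inode */
	xfs_fileoff_t	bno)	/* file offset to translate */
{
	xfs_fsblock_t	fsb;	/* resulting filesystem block */
	int		error;	/* error return value */

	error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, bno);
	if (error)
		return error;
	if (fsb == NULLFSBLOCK)
		return 0;	/* nothing mapped at bno */
	/* fsb now names the block backing file offset bno */
	return 0;
}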
5136
5137/*
5138 * Unmap (remove) blocks from a file.
5139 * If nexts is nonzero then the number of extents to remove is limited to
5140 * that value. *done is set once the whole requested range has been
5141 * unmapped; if the extent limit stops the operation early, it is left clear.
5142 */
5143int /* error */
5144xfs_bunmapi(
5145 xfs_trans_t *tp, /* transaction pointer */
5146 struct xfs_inode *ip, /* incore inode */
5147 xfs_fileoff_t bno, /* starting offset to unmap */
5148 xfs_filblks_t len, /* length to unmap in file */
5149 int flags, /* misc flags */
5150 xfs_extnum_t nexts, /* number of extents max */
5151 xfs_fsblock_t *firstblock, /* first allocated block
5152 controls a.g. for allocs */
5153 xfs_bmap_free_t *flist, /* i/o: list extents to free */
5154	int		*done)		/* out: set when whole range unmapped */
5155{
5156 xfs_btree_cur_t *cur; /* bmap btree cursor */
5157 xfs_bmbt_irec_t del; /* extent being deleted */
5158 int eof; /* is deleting at eof */
5159 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
5160 int error; /* error return value */
5161 xfs_extnum_t extno; /* extent number in list */
5162 xfs_bmbt_irec_t got; /* current extent list entry */
5163 xfs_ifork_t *ifp; /* inode fork pointer */
5164 int isrt; /* freeing in rt area */
5165 xfs_extnum_t lastx; /* last extent index used */
5166 int logflags; /* transaction logging flags */
5167 xfs_extlen_t mod; /* rt extent offset */
5168 xfs_mount_t *mp; /* mount structure */
5169 xfs_extnum_t nextents; /* size of extent list */
5170 xfs_bmbt_irec_t prev; /* previous extent list entry */
5171 xfs_fileoff_t start; /* first file offset deleted */
5172 int tmp_logflags; /* partial logging flags */
5173 int wasdel; /* was a delayed alloc extent */
5174 int whichfork; /* data or attribute fork */
5175 int rsvd; /* OK to allocate reserved blocks */
5176 xfs_fsblock_t sum;
5177
5178 xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address);
5179 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
5180 XFS_ATTR_FORK : XFS_DATA_FORK;
5181 ifp = XFS_IFORK_PTR(ip, whichfork);
5182 if (unlikely(
5183 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5184 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
5185 XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
5186 ip->i_mount);
5187 return XFS_ERROR(EFSCORRUPTED);
5188 }
5189 mp = ip->i_mount;
5190 if (XFS_FORCED_SHUTDOWN(mp))
5191 return XFS_ERROR(EIO);
5192 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
5193 ASSERT(len > 0);
5194 ASSERT(nexts >= 0);
5195 ASSERT(ifp->if_ext_max ==
5196 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5197 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5198 (error = xfs_iread_extents(tp, ip, whichfork)))
5199 return error;
5200 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5201 if (nextents == 0) {
5202 *done = 1;
5203 return 0;
5204 }
5205 XFS_STATS_INC(xs_blk_unmap);
5206 isrt = (whichfork == XFS_DATA_FORK) &&
5207 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
5208 start = bno;
5209 bno = start + len - 1;
5210 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
5211 &prev);
5212 /*
5213 * Check to see if the given block number is past the end of the
5214 * file, back up to the last block if so...
5215 */
5216 if (eof) {
5217 ep = &ifp->if_u1.if_extents[--lastx];
5218 xfs_bmbt_get_all(ep, &got);
5219 bno = got.br_startoff + got.br_blockcount - 1;
5220 }
5221 logflags = 0;
5222 if (ifp->if_flags & XFS_IFBROOT) {
5223 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
5224 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
5225 whichfork);
5226 cur->bc_private.b.firstblock = *firstblock;
5227 cur->bc_private.b.flist = flist;
5228 cur->bc_private.b.flags = 0;
5229 } else
5230 cur = NULL;
5231 extno = 0;
5232 while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
5233 (nexts == 0 || extno < nexts)) {
5234 /*
5235 * Is the found extent after a hole in which bno lives?
5236 * Just back up to the previous extent, if so.
5237 */
5238 if (got.br_startoff > bno) {
5239 if (--lastx < 0)
5240 break;
5241 ep--;
5242 xfs_bmbt_get_all(ep, &got);
5243 }
5244 /*
5245 * Is the last block of this extent before the range
5246 * we're supposed to delete? If so, we're done.
5247 */
5248 bno = XFS_FILEOFF_MIN(bno,
5249 got.br_startoff + got.br_blockcount - 1);
5250 if (bno < start)
5251 break;
5252 /*
5253 * Then deal with the (possibly delayed) allocated space
5254 * we found.
5255 */
5256 ASSERT(ep != NULL);
5257 del = got;
5258 wasdel = ISNULLSTARTBLOCK(del.br_startblock);
5259 if (got.br_startoff < start) {
5260 del.br_startoff = start;
5261 del.br_blockcount -= start - got.br_startoff;
5262 if (!wasdel)
5263 del.br_startblock += start - got.br_startoff;
5264 }
5265 if (del.br_startoff + del.br_blockcount > bno + 1)
5266 del.br_blockcount = bno + 1 - del.br_startoff;
5267 sum = del.br_startblock + del.br_blockcount;
5268 if (isrt &&
5269 (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
5270 /*
5271 * Realtime extent not lined up at the end.
5272 * The extent could have been split into written
5273 * and unwritten pieces, or we could just be
5274 * unmapping part of it. But we can't really
5275 * get rid of part of a realtime extent.
5276 */
5277 if (del.br_state == XFS_EXT_UNWRITTEN ||
5278 !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
5279 /*
5280 * This piece is unwritten, or we're not
5281 * using unwritten extents. Skip over it.
5282 */
5283 ASSERT(bno >= mod);
5284 bno -= mod > del.br_blockcount ?
5285 del.br_blockcount : mod;
5286 if (bno < got.br_startoff) {
5287 if (--lastx >= 0)
5288 xfs_bmbt_get_all(--ep, &got);
5289 }
5290 continue;
5291 }
5292 /*
5293 * It's written, turn it unwritten.
5294 * This is better than zeroing it.
5295 */
5296 ASSERT(del.br_state == XFS_EXT_NORM);
5297 ASSERT(xfs_trans_get_block_res(tp) > 0);
5298 /*
5299 * If this spans a realtime extent boundary,
5300 * chop it back to the start of the one we end at.
5301 */
5302 if (del.br_blockcount > mod) {
5303 del.br_startoff += del.br_blockcount - mod;
5304 del.br_startblock += del.br_blockcount - mod;
5305 del.br_blockcount = mod;
5306 }
5307 del.br_state = XFS_EXT_UNWRITTEN;
5308 error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
5309 firstblock, flist, &logflags, XFS_DATA_FORK, 0);
5310 if (error)
5311 goto error0;
5312 goto nodelete;
5313 }
5314 if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
5315 /*
5316 * Realtime extent is lined up at the end but not
5317 * at the front. We'll get rid of full extents if
5318 * we can.
5319 */
5320 mod = mp->m_sb.sb_rextsize - mod;
5321 if (del.br_blockcount > mod) {
5322 del.br_blockcount -= mod;
5323 del.br_startoff += mod;
5324 del.br_startblock += mod;
5325 } else if ((del.br_startoff == start &&
5326 (del.br_state == XFS_EXT_UNWRITTEN ||
5327 xfs_trans_get_block_res(tp) == 0)) ||
5328 !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
5329 /*
5330 * Can't make it unwritten. There isn't
5331 * a full extent here so just skip it.
5332 */
5333 ASSERT(bno >= del.br_blockcount);
5334 bno -= del.br_blockcount;
5335 if (bno < got.br_startoff) {
5336 if (--lastx >= 0)
5337 xfs_bmbt_get_all(--ep, &got);
5338 }
5339 continue;
5340 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
5341 /*
5342 * This one is already unwritten.
5343 * It must have a written left neighbor.
5344 * Unwrite the killed part of that one and
5345 * try again.
5346 */
5347 ASSERT(lastx > 0);
5348 xfs_bmbt_get_all(ep - 1, &prev);
5349 ASSERT(prev.br_state == XFS_EXT_NORM);
5350 ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
5351 ASSERT(del.br_startblock ==
5352 prev.br_startblock + prev.br_blockcount);
5353 if (prev.br_startoff < start) {
5354 mod = start - prev.br_startoff;
5355 prev.br_blockcount -= mod;
5356 prev.br_startblock += mod;
5357 prev.br_startoff = start;
5358 }
5359 prev.br_state = XFS_EXT_UNWRITTEN;
5360 error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
5361 &prev, firstblock, flist, &logflags,
5362 XFS_DATA_FORK, 0);
5363 if (error)
5364 goto error0;
5365 goto nodelete;
5366 } else {
5367 ASSERT(del.br_state == XFS_EXT_NORM);
5368 del.br_state = XFS_EXT_UNWRITTEN;
5369 error = xfs_bmap_add_extent(ip, lastx, &cur,
5370 &del, firstblock, flist, &logflags,
5371 XFS_DATA_FORK, 0);
5372 if (error)
5373 goto error0;
5374 goto nodelete;
5375 }
5376 }
5377 if (wasdel) {
5378 ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
5379 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
5380 (int)del.br_blockcount, rsvd);
5381 /* Unreserve our quota space */
5382 XFS_TRANS_RESERVE_QUOTA_NBLKS(
5383 mp, NULL, ip, -((long)del.br_blockcount), 0,
5384 isrt ? XFS_QMOPT_RES_RTBLKS :
5385 XFS_QMOPT_RES_REGBLKS);
5386 ip->i_delayed_blks -= del.br_blockcount;
5387 if (cur)
5388 cur->bc_private.b.flags |=
5389 XFS_BTCUR_BPRV_WASDEL;
5390 } else if (cur)
5391 cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
5392 /*
5393 * If it's the case where the directory code is running
5394 * with no block reservation, and the deleted block is in
5395 * the middle of its extent, and the resulting insert
5396 * of an extent would cause transformation to btree format,
5397 * then reject it. The calling code will then swap
5398 * blocks around instead.
5399 * We have to do this now, rather than waiting for the
5400 * conversion to btree format, since the transaction
5401 * will be dirty.
5402 */
5403 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5404 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5405 XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
5406 del.br_startoff > got.br_startoff &&
5407 del.br_startoff + del.br_blockcount <
5408 got.br_startoff + got.br_blockcount) {
5409 error = XFS_ERROR(ENOSPC);
5410 goto error0;
5411 }
5412 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
5413 &tmp_logflags, whichfork, rsvd);
5414 logflags |= tmp_logflags;
5415 if (error)
5416 goto error0;
5417 bno = del.br_startoff - 1;
5418nodelete:
5419 lastx = ifp->if_lastex;
5420 /*
5421 * If not done go on to the next (previous) record.
5422 * Reset ep in case the extents array was re-alloced.
5423 */
5424 ep = &ifp->if_u1.if_extents[lastx];
5425 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5426 if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
5427 xfs_bmbt_get_startoff(ep) > bno) {
5428 lastx--;
5429 ep--;
5430 }
5431 if (lastx >= 0)
5432 xfs_bmbt_get_all(ep, &got);
5433 extno++;
5434 }
5435 }
5436 ifp->if_lastex = lastx;
5437 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5438 ASSERT(ifp->if_ext_max ==
5439 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5440 /*
5441 * Convert to a btree if necessary.
5442 */
5443 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5444 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
5445 ASSERT(cur == NULL);
5446 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
5447 &cur, 0, &tmp_logflags, whichfork);
5448 logflags |= tmp_logflags;
5449 if (error)
5450 goto error0;
5451 }
5452 /*
5453 * transform from btree to extents, give it cur
5454 */
5455 else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
5456 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5457 ASSERT(cur != NULL);
5458 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5459 whichfork);
5460 logflags |= tmp_logflags;
5461 if (error)
5462 goto error0;
5463 }
5464 /*
5465 * transform from extents to local?
5466 */
5467 ASSERT(ifp->if_ext_max ==
5468 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5469 error = 0;
5470error0:
5471 /*
5472 * Log everything. Do this after conversion, there's no point in
5473 * logging the extent list if we've converted to btree format.
5474 */
5475 if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
5476 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5477 logflags &= ~XFS_ILOG_FEXT(whichfork);
5478 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
5479 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5480 logflags &= ~XFS_ILOG_FBROOT(whichfork);
5481 /*
5482 * Log inode even in the error case, if the transaction
5483 * is dirty we'll need to shut down the filesystem.
5484 */
5485 if (logflags)
5486 xfs_trans_log_inode(tp, ip, logflags);
5487 if (cur) {
5488 if (!error) {
5489 *firstblock = cur->bc_private.b.firstblock;
5490 cur->bc_private.b.allocated = 0;
5491 }
5492 xfs_btree_del_cursor(cur,
5493 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5494 }
5495 return error;
5496}
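
/*
 * Illustrative sketch (editorial addition): the calling pattern
 * xfs_bunmapi() expects. The caller owns the transaction; this sketch
 * assumes tp is already allocated, reserved, and joined to the locked
 * inode, and glosses over the re-reservation a real caller (such as
 * truncate) performs between passes. Each pass removes a small batch
 * of extents, then xfs_bmap_finish() frees the accumulated blocks.
 */
STATIC int
example_unmap_range(
	xfs_trans_t	*tp,		/* set-up transaction */
	xfs_inode_t	*ip,		/* locked, joined incore inode */
	xfs_fileoff_t	start_fsb,	/* first fsblock to unmap */
	xfs_filblks_t	len_fsb)	/* number of fsblocks to unmap */
{
	xfs_fsblock_t	firstblock;	/* a.g.-controlling block */
	xfs_bmap_free_t	flist;		/* blocks to free at trans end */
	int		committed;	/* xfs_bmap_finish committed? */
	int		done;		/* set when range fully unmapped */
	int		error;		/* error return value */

	done = 0;
	while (!done) {
		XFS_BMAP_INIT(&flist, &firstblock);
		/* the batch size of 2 extents per pass is arbitrary */
		error = xfs_bunmapi(tp, ip, start_fsb, len_fsb, 0, 2,
				&firstblock, &flist, &done);
		if (error)
			goto error0;
		error = xfs_bmap_finish(&tp, &flist, firstblock,
				&committed);
		if (error)
			goto error0;
	}
	return 0;
error0:
	xfs_bmap_cancel(&flist);
	return error;
}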
5497
5498/*
5499 * Fcntl interface to xfs_bmapi.
5500 */
5501int /* error code */
5502xfs_getbmap(
5503 bhv_desc_t *bdp, /* XFS behavior descriptor*/
5504 struct getbmap *bmv, /* user bmap structure */
5505 void __user *ap, /* pointer to user's array */
5506 int interface) /* interface flags */
5507{
5508 __int64_t bmvend; /* last block requested */
5509 int error; /* return value */
5510 __int64_t fixlen; /* length for -1 case */
5511 int i; /* extent number */
5512 xfs_inode_t *ip; /* xfs incore inode pointer */
5513 vnode_t *vp; /* corresponding vnode */
5514 int lock; /* lock state */
5515 xfs_bmbt_irec_t *map; /* buffer for user's data */
5516 xfs_mount_t *mp; /* file system mount point */
5517 int nex; /* # of user extents can do */
5518 int nexleft; /* # of user extents left */
5519 int subnex; /* # of bmapi's can do */
5520 int nmap; /* number of map entries */
5521 struct getbmap out; /* output structure */
5522 int whichfork; /* data or attr fork */
5523 int prealloced; /* this is a file with
5524 * preallocated data space */
5525	int			sh_unwritten;	/* true if unwritten extents */
5526						/* are listed separately */
5527 int bmapi_flags; /* flags for xfs_bmapi */
5528 __int32_t oflags; /* getbmapx bmv_oflags field */
5529
5530 vp = BHV_TO_VNODE(bdp);
5531 ip = XFS_BHVTOI(bdp);
5532 mp = ip->i_mount;
5533
5534 whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
5535 sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
5536
5537	/* If the BMV_IF_NO_DMAPI_READ interface bit is specified, do not
5538 * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ
5539 * bit is set for the file, generate a read event in order
5540 * that the DMAPI application may do its thing before we return
5541 * the extents. Usually this means restoring user file data to
5542 * regions of the file that look like holes.
5543 *
5544 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
5545 * BMV_IF_NO_DMAPI_READ so that read events are generated.
5546 * If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
5547 * could misinterpret holes in a DMAPI file as true holes,
5548 * when in fact they may represent offline user data.
5549 */
5550	if ((interface & BMV_IF_NO_DMAPI_READ) == 0
5551 && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)
5552 && whichfork == XFS_DATA_FORK) {
5553
5554 error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL);
5555 if (error)
5556 return XFS_ERROR(error);
5557 }
5558
5559 if (whichfork == XFS_ATTR_FORK) {
5560 if (XFS_IFORK_Q(ip)) {
5561 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
5562 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
5563 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
5564 return XFS_ERROR(EINVAL);
5565 } else if (unlikely(
5566 ip->i_d.di_aformat != 0 &&
5567 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
5568 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
5569 ip->i_mount);
5570 return XFS_ERROR(EFSCORRUPTED);
5571 }
5572 } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
5573 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
5574 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
5575 return XFS_ERROR(EINVAL);
5576 if (whichfork == XFS_DATA_FORK) {
5577 if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) {
5578 prealloced = 1;
5579 fixlen = XFS_MAXIOFFSET(mp);
5580 } else {
5581 prealloced = 0;
5582 fixlen = ip->i_d.di_size;
5583 }
5584 } else {
5585 prealloced = 0;
5586 fixlen = 1LL << 32;
5587 }
5588
5589 if (bmv->bmv_length == -1) {
5590 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
5591		bmv->bmv_length = MAX((__int64_t)(fixlen - bmv->bmv_offset),
5592 (__int64_t)0);
5593 } else if (bmv->bmv_length < 0)
5594 return XFS_ERROR(EINVAL);
5595 if (bmv->bmv_length == 0) {
5596 bmv->bmv_entries = 0;
5597 return 0;
5598 }
5599 nex = bmv->bmv_count - 1;
5600 if (nex <= 0)
5601 return XFS_ERROR(EINVAL);
5602 bmvend = bmv->bmv_offset + bmv->bmv_length;
5603
5604 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5605
5606 if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
5607 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
5608 VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
5609 }
5610
5611 ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
5612
5613 lock = xfs_ilock_map_shared(ip);
5614
5615 /*
5616 * Don't let nex be bigger than the number of extents
5617 * we can have assuming alternating holes and real extents.
5618 */
5619 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
5620 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5621
5622 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
5623 ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE);
5624
5625 /*
5626 * Allocate enough space to handle "subnex" maps at a time.
5627 */
5628 subnex = 16;
5629 map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
5630
5631 bmv->bmv_entries = 0;
5632
5633 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) {
5634 error = 0;
5635 goto unlock_and_return;
5636 }
5637
5638 nexleft = nex;
5639
5640 do {
5641 nmap = (nexleft > subnex) ? subnex : nexleft;
5642 error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
5643 XFS_BB_TO_FSB(mp, bmv->bmv_length),
5644 bmapi_flags, NULL, 0, map, &nmap, NULL);
5645 if (error)
5646 goto unlock_and_return;
5647 ASSERT(nmap <= subnex);
5648
5649 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5650 nexleft--;
5651 oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ?
5652 BMV_OF_PREALLOC : 0;
5653 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
5654 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
5655 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
5656 if (prealloced &&
5657 map[i].br_startblock == HOLESTARTBLOCK &&
5658 out.bmv_offset + out.bmv_length == bmvend) {
5659 /*
5660 * came to hole at end of file
5661 */
5662 goto unlock_and_return;
5663 } else {
5664 out.bmv_block =
5665 (map[i].br_startblock == HOLESTARTBLOCK) ?
5666 -1 :
5667 XFS_FSB_TO_DB(ip, map[i].br_startblock);
5668
5669 /* return either getbmap/getbmapx structure. */
5670 if (interface & BMV_IF_EXTENDED) {
5671 struct getbmapx outx;
5672
5673 GETBMAP_CONVERT(out,outx);
5674 outx.bmv_oflags = oflags;
5675 outx.bmv_unused1 = outx.bmv_unused2 = 0;
5676 if (copy_to_user(ap, &outx,
5677 sizeof(outx))) {
5678 error = XFS_ERROR(EFAULT);
5679 goto unlock_and_return;
5680 }
5681 } else {
5682 if (copy_to_user(ap, &out,
5683 sizeof(out))) {
5684 error = XFS_ERROR(EFAULT);
5685 goto unlock_and_return;
5686 }
5687 }
5688 bmv->bmv_offset =
5689 out.bmv_offset + out.bmv_length;
5690 bmv->bmv_length = MAX((__int64_t)0,
5691 (__int64_t)(bmvend - bmv->bmv_offset));
5692 bmv->bmv_entries++;
5693 ap = (interface & BMV_IF_EXTENDED) ?
5694 (void __user *)
5695 ((struct getbmapx __user *)ap + 1) :
5696 (void __user *)
5697 ((struct getbmap __user *)ap + 1);
5698 }
5699 }
5700 } while (nmap && nexleft && bmv->bmv_length);
5701
5702unlock_and_return:
5703 xfs_iunlock_map_shared(ip, lock);
5704 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
5705
5706 kmem_free(map, subnex * sizeof(*map));
5707
5708 return error;
5709}
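
/*
 * Illustrative sketch (editorial addition): how a userspace program
 * might drive the interface above through ioctl(XFS_IOC_GETBMAP), the
 * "old behavior" interface named in the DMAPI comment. Element 0 of
 * the array is the request/progress header; the kernel fills
 * bmv_entries output records after it and advances bmv_offset and
 * bmv_length for the next call. Assumes <xfs/xfs_fs.h> (or the local
 * equivalent) supplies struct getbmap and XFS_IOC_GETBMAP.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>

static int
print_file_extents(int fd)
{
	struct getbmap	bmv[17];	/* header plus 16 output slots */
	int		i;

	memset(bmv, 0, sizeof(bmv));
	bmv[0].bmv_offset = 0;		/* start of file, in 512b units */
	bmv[0].bmv_length = -1;		/* -1 means "to end of file" */
	bmv[0].bmv_count = 17;		/* includes the header element */
	do {
		if (ioctl(fd, XFS_IOC_GETBMAP, bmv) < 0)
			return -1;
		for (i = 1; i <= bmv[0].bmv_entries; i++)
			printf("off %lld len %lld block %lld\n",
				(long long)bmv[i].bmv_offset,
				(long long)bmv[i].bmv_length,
				(long long)bmv[i].bmv_block);	/* -1: hole */
	} while (bmv[0].bmv_entries > 0);
	return 0;
}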
5710
5711/*
5712 * Check the last inode extent to determine whether this allocation will result
5713 * in blocks being allocated at the end of the file. When we allocate new data
5714 * blocks at the end of the file which do not start at the previous data block,
5715 * we will try to align the new blocks at stripe unit boundaries.
5716 */
5717int /* error */
5718xfs_bmap_isaeof(
5719 xfs_inode_t *ip, /* incore inode pointer */
5720 xfs_fileoff_t off, /* file offset in fsblocks */
5721 int whichfork, /* data or attribute fork */
5722 char *aeof) /* return value */
5723{
5724 int error; /* error return value */
5725 xfs_ifork_t *ifp; /* inode fork pointer */
5726 xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */
5727 xfs_extnum_t nextents; /* size of extent list */
5728 xfs_bmbt_irec_t s; /* expanded extent list entry */
5729
5730 ASSERT(whichfork == XFS_DATA_FORK);
5731 ifp = XFS_IFORK_PTR(ip, whichfork);
5732 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5733 (error = xfs_iread_extents(NULL, ip, whichfork)))
5734 return error;
5735 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5736 if (nextents == 0) {
5737 *aeof = 1;
5738 return 0;
5739 }
5740 /*
5741 * Go to the last extent
5742 */
5743 lastrec = &ifp->if_u1.if_extents[nextents - 1];
5744 xfs_bmbt_get_all(lastrec, &s);
5745 /*
5746 * Check we are allocating in the last extent (for delayed allocations)
5747 * or past the last extent for non-delayed allocations.
5748 */
5749 *aeof = (off >= s.br_startoff &&
5750 off < s.br_startoff + s.br_blockcount &&
5751 ISNULLSTARTBLOCK(s.br_startblock)) ||
5752 off >= s.br_startoff + s.br_blockcount;
5753 return 0;
5754}
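
/*
 * Worked example (editorial note): if the last extent is a delayed
 * allocation covering file blocks [100, 110), then off 105 yields
 * *aeof = 1 (allocating inside the delalloc tail), off 110 yields
 * *aeof = 1 (past the last extent), and off 95 yields *aeof = 0.
 */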
5755
5756/*
5757 * Check if the endoff is outside the last extent. If so the caller will grow
5758 * the allocation to a stripe unit boundary.
5759 */
5760int /* error */
5761xfs_bmap_eof(
5762 xfs_inode_t *ip, /* incore inode pointer */
5763 xfs_fileoff_t endoff, /* file offset in fsblocks */
5764 int whichfork, /* data or attribute fork */
5765 int *eof) /* result value */
5766{
5767 xfs_fsblock_t blockcount; /* extent block count */
5768 int error; /* error return value */
5769 xfs_ifork_t *ifp; /* inode fork pointer */
5770 xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */
5771 xfs_extnum_t nextents; /* size of extent list */
5772 xfs_fileoff_t startoff; /* extent starting file offset */
5773
5774 ASSERT(whichfork == XFS_DATA_FORK);
5775 ifp = XFS_IFORK_PTR(ip, whichfork);
5776 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5777 (error = xfs_iread_extents(NULL, ip, whichfork)))
5778 return error;
5779 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5780 if (nextents == 0) {
5781 *eof = 1;
5782 return 0;
5783 }
5784 /*
5785 * Go to the last extent
5786 */
5787 lastrec = &ifp->if_u1.if_extents[nextents - 1];
5788 startoff = xfs_bmbt_get_startoff(lastrec);
5789 blockcount = xfs_bmbt_get_blockcount(lastrec);
5790 *eof = endoff >= startoff + blockcount;
5791 return 0;
5792}
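
/*
 * Worked example (editorial note): with a single extent covering file
 * blocks [0, 8), endoff 8 yields *eof = 1 while endoff 7 yields
 * *eof = 0, so only an allocation extending past the last extent
 * triggers the caller's stripe-unit alignment.
 */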
5793
5794#ifdef DEBUG
5795/*
5796 * Check that the extents list for the inode ip is in the right order.
5797 */
5798STATIC void
5799xfs_bmap_check_extents(
5800 xfs_inode_t *ip, /* incore inode pointer */
5801 int whichfork) /* data or attr fork */
5802{
5803 xfs_bmbt_rec_t *base; /* base of extents list */
5804 xfs_bmbt_rec_t *ep; /* current extent entry */
5805 xfs_ifork_t *ifp; /* inode fork pointer */
5806 xfs_extnum_t nextents; /* number of extents in list */
5807
5808 ifp = XFS_IFORK_PTR(ip, whichfork);
5809 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
5810 base = ifp->if_u1.if_extents;
5811 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5812 for (ep = base; ep < &base[nextents - 1]; ep++) {
5813 xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
5814 (void *)(ep + 1));
5815 }
5816}
5817
5818STATIC
5819xfs_buf_t *
5820xfs_bmap_get_bp(
5821 xfs_btree_cur_t *cur,
5822 xfs_fsblock_t bno)
5823{
5824 int i;
5825 xfs_buf_t *bp;
5826
5827 if (!cur)
5828		return NULL;
5829
5830 bp = NULL;
5831	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5832 bp = cur->bc_bufs[i];
5833 if (!bp) break;
5834 if (XFS_BUF_ADDR(bp) == bno)
5835 break; /* Found it */
5836 }
5837 if (i == XFS_BTREE_MAXLEVELS)
5838 bp = NULL;
5839
5840 if (!bp) { /* Chase down all the log items to see if the bp is there */
5841 xfs_log_item_chunk_t *licp;
5842 xfs_trans_t *tp;
5843
5844 tp = cur->bc_tp;
5845 licp = &tp->t_items;
5846 while (!bp && licp != NULL) {
5847 if (XFS_LIC_ARE_ALL_FREE(licp)) {
5848 licp = licp->lic_next;
5849 continue;
5850 }
5851 for (i = 0; i < licp->lic_unused; i++) {
5852 xfs_log_item_desc_t *lidp;
5853 xfs_log_item_t *lip;
5854 xfs_buf_log_item_t *bip;
5855 xfs_buf_t *lbp;
5856
5857 if (XFS_LIC_ISFREE(licp, i)) {
5858 continue;
5859 }
5860
5861 lidp = XFS_LIC_SLOT(licp, i);
5862 lip = lidp->lid_item;
5863 if (lip->li_type != XFS_LI_BUF)
5864 continue;
5865
5866 bip = (xfs_buf_log_item_t *)lip;
5867 lbp = bip->bli_buf;
5868
5869 if (XFS_BUF_ADDR(lbp) == bno) {
5870 bp = lbp;
5871 break; /* Found it */
5872 }
5873 }
5874 licp = licp->lic_next;
5875 }
5876 }
5877	return bp;
5878}
5879
5880void
5881xfs_check_block(
5882 xfs_bmbt_block_t *block,
5883 xfs_mount_t *mp,
5884 int root,
5885 short sz)
5886{
5887 int i, j, dmxr;
5888 xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */
5889 xfs_bmbt_key_t *prevp, *keyp;
5890
5891 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
5892
5893 prevp = NULL;
5894	for (i = 1; i <= INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
5895 dmxr = mp->m_bmap_dmxr[0];
5896
5897 if (root) {
5898 keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
5899 } else {
5900 keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize,
5901 xfs_bmbt, block, i, dmxr);
5902 }
5903
5904 if (prevp) {
5905 xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
5906 }
5907 prevp = keyp;
5908
5909 /*
5910 * Compare the block numbers to see if there are dups.
5911 */
5912
5913 if (root) {
5914 pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
5915 } else {
5916 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
5917 xfs_bmbt, block, i, dmxr);
5918 }
5919 for (j = i+1; j <= INT_GET(block->bb_numrecs, ARCH_CONVERT); j++) {
5920 if (root) {
5921 thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
5922 } else {
5923 thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
5924 xfs_bmbt, block, j, dmxr);
5925 }
5926 if (INT_GET(*thispa, ARCH_CONVERT) ==
5927 INT_GET(*pp, ARCH_CONVERT)) {
5928 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
5929 __FUNCTION__, j, i,
5930 INT_GET(*thispa, ARCH_CONVERT));
5931 panic("%s: ptrs are equal in node\n",
5932 __FUNCTION__);
5933 }
5934 }
5935 }
5936}
5937
5938/*
5939 * Check that the extents for the inode ip are in the right order in all
5940 * btree leaves.
5941 */
5942
5943STATIC void
5944xfs_bmap_check_leaf_extents(
5945 xfs_btree_cur_t *cur, /* btree cursor or null */
5946 xfs_inode_t *ip, /* incore inode pointer */
5947 int whichfork) /* data or attr fork */
5948{
5949 xfs_bmbt_block_t *block; /* current btree block */
5950 xfs_fsblock_t bno; /* block # of "block" */
5951 xfs_buf_t *bp; /* buffer for "block" */
5952 int error; /* error return value */
5953	xfs_extnum_t		i = 0;	/* index into the extents list */
5954 xfs_ifork_t *ifp; /* fork structure */
5955 int level; /* btree level, for checking */
5956 xfs_mount_t *mp; /* file system mount structure */
5957 xfs_bmbt_ptr_t *pp; /* pointer to block address */
5958 xfs_bmbt_rec_t *ep, *lastp; /* extent pointers in block entry */
5959 int bp_release = 0;
5960
5961 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
5962 return;
5963 }
5964
5965 bno = NULLFSBLOCK;
5966 mp = ip->i_mount;
5967 ifp = XFS_IFORK_PTR(ip, whichfork);
5968 block = ifp->if_broot;
5969 /*
5970	 * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
5971 */
5972 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
5973 level = INT_GET(block->bb_level, ARCH_CONVERT);
5974 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
5975 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
5976 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
5977 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
5978 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
5979 bno = INT_GET(*pp, ARCH_CONVERT);
5980 /*
5981 * Go down the tree until leaf level is reached, following the first
5982 * pointer (leftmost) at each level.
5983 */
5984 while (level-- > 0) {
5985 /* See if buf is in cur first */
5986 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5987 if (bp) {
5988 bp_release = 0;
5989 } else {
5990 bp_release = 1;
5991 }
5992 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5993 XFS_BMAP_BTREE_REF)))
5994 goto error_norelse;
5995 block = XFS_BUF_TO_BMBT_BLOCK(bp);
5996 XFS_WANT_CORRUPTED_GOTO(
5997 XFS_BMAP_SANITY_CHECK(mp, block, level),
5998 error0);
5999 if (level == 0)
6000 break;
6001
6002 /*
6003 * Check this block for basic sanity (increasing keys and
6004 * no duplicate blocks).
6005 */
6006
6007 xfs_check_block(block, mp, 0, 0);
6008 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
6009 1, mp->m_bmap_dmxr[1]);
6010 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0);
6011 bno = INT_GET(*pp, ARCH_CONVERT);
6012 if (bp_release) {
6013 bp_release = 0;
6014 xfs_trans_brelse(NULL, bp);
6015 }
6016 }
6017
6018 /*
6019 * Here with bp and block set to the leftmost leaf node in the tree.
6020 */
6021 i = 0;
6022
6023 /*
6024 * Loop over all leaf nodes checking that all extents are in the right order.
6025 */
6026 lastp = NULL;
6027 for (;;) {
6028 xfs_bmbt_rec_t *frp;
6029 xfs_fsblock_t nextbno;
6030 xfs_extnum_t num_recs;
6031
6032
6033 num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
6034
6035 /*
6036		 * Get the block number of the next leaf block, if any.
6037 */
6038
6039 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
6040
6041 /*
6042 * Check all the extents to make sure they are OK.
6043 * If we had a previous block, the last entry should
6044 * conform with the first entry in this one.
6045 */
6046
6047 frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
6048 block, 1, mp->m_bmap_dmxr[0]);
6049
6050		for (ep = frp; ep < frp + (num_recs - 1); ep++) {
6051 if (lastp) {
6052 xfs_btree_check_rec(XFS_BTNUM_BMAP,
6053 (void *)lastp, (void *)ep);
6054 }
6055 xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
6056 (void *)(ep + 1));
6057 }
6058 lastp = frp + num_recs - 1; /* For the next iteration */
6059
6060 i += num_recs;
6061 if (bp_release) {
6062 bp_release = 0;
6063 xfs_trans_brelse(NULL, bp);
6064 }
6065 bno = nextbno;
6066 /*
6067 * If we've reached the end, stop.
6068 */
6069 if (bno == NULLFSBLOCK)
6070 break;
6071
6072 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
6073 if (bp) {
6074 bp_release = 0;
6075 } else {
6076 bp_release = 1;
6077 }
6078 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6079 XFS_BMAP_BTREE_REF)))
6080 goto error_norelse;
6081 block = XFS_BUF_TO_BMBT_BLOCK(bp);
6082 }
6083 if (bp_release) {
6084 bp_release = 0;
6085 xfs_trans_brelse(NULL, bp);
6086 }
6087 return;
6088
6089error0:
6090 cmn_err(CE_WARN, "%s: at error0", __FUNCTION__);
6091 if (bp_release)
6092 xfs_trans_brelse(NULL, bp);
6093error_norelse:
6094 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
6095		__FUNCTION__, i);
6096 panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__);
6097 return;
6098}
6099#endif
6100
6101/*
6102 * Count fsblocks of the given fork.
6103 */
6104int /* error */
6105xfs_bmap_count_blocks(
6106 xfs_trans_t *tp, /* transaction pointer */
6107 xfs_inode_t *ip, /* incore inode */
6108 int whichfork, /* data or attr fork */
6109 int *count) /* out: count of blocks */
6110{
6111 xfs_bmbt_block_t *block; /* current btree block */
6112 xfs_fsblock_t bno; /* block # of "block" */
6113 xfs_ifork_t *ifp; /* fork structure */
6114 int level; /* btree level, for checking */
6115 xfs_mount_t *mp; /* file system mount structure */
6116 xfs_bmbt_ptr_t *pp; /* pointer to block address */
6117
6118 bno = NULLFSBLOCK;
6119 mp = ip->i_mount;
6120 ifp = XFS_IFORK_PTR(ip, whichfork);
6121 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
6122 if (unlikely(xfs_bmap_count_leaves(ifp->if_u1.if_extents,
6123 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
6124 count) < 0)) {
6125 XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
6126 XFS_ERRLEVEL_LOW, mp);
6127 return XFS_ERROR(EFSCORRUPTED);
6128 }
6129 return 0;
6130 }
6131
6132 /*
6133	 * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
6134 */
6135 block = ifp->if_broot;
6136 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
6137 level = INT_GET(block->bb_level, ARCH_CONVERT);
6138 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
6139 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
6140 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
6141 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
6142 bno = INT_GET(*pp, ARCH_CONVERT);
6143
6144 if (unlikely(xfs_bmap_count_tree(mp, tp, bno, level, count) < 0)) {
6145 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
6146 mp);
6147 return XFS_ERROR(EFSCORRUPTED);
6148 }
6149
6150 return 0;
6151}
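
/*
 * Illustrative sketch (editorial addition): counting the fsblocks
 * behind the data fork with xfs_bmap_count_blocks(). Note the function
 * only adds to *count, so the caller zeroes it first; a null
 * transaction is fine for a pure read under the inode lock.
 */
STATIC int
example_count_data_blocks(
	xfs_inode_t	*ip,	/* locked incore inode */
	int		*count)	/* out: blocks behind the data fork */
{
	*count = 0;
	return xfs_bmap_count_blocks(NULL, ip, XFS_DATA_FORK, count);
}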
6152
6153/*
6154 * Recursively walks each level of a btree
6155 * to count total fsblocks in use.
6156 */
6157int /* error */
6158xfs_bmap_count_tree(
6159 xfs_mount_t *mp, /* file system mount point */
6160 xfs_trans_t *tp, /* transaction pointer */
6161 xfs_fsblock_t blockno, /* file system block number */
6162 int levelin, /* level in btree */
6163 int *count) /* Count of blocks */
6164{
6165 int error;
6166 xfs_buf_t *bp, *nbp;
6167 int level = levelin;
6168 xfs_bmbt_ptr_t *pp;
6169 xfs_fsblock_t bno = blockno;
6170 xfs_fsblock_t nextbno;
6171 xfs_bmbt_block_t *block, *nextblock;
6172 int numrecs;
6173 xfs_bmbt_rec_t *frp;
6174
6175 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
6176 return error;
6177 *count += 1;
6178 block = XFS_BUF_TO_BMBT_BLOCK(bp);
6179
6180 if (--level) {
6181		/* Not at the node level above the leaves; count this level of nodes */
6182 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
6183 while (nextbno != NULLFSBLOCK) {
6184 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
6185 0, &nbp, XFS_BMAP_BTREE_REF)))
6186 return error;
6187 *count += 1;
6188 nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
6189 nextbno = INT_GET(nextblock->bb_rightsib, ARCH_CONVERT);
6190 xfs_trans_brelse(tp, nbp);
6191 }
6192
6193 /* Dive to the next level */
6194 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
6195 xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
6196 bno = INT_GET(*pp, ARCH_CONVERT);
6197 if (unlikely((error =
6198 xfs_bmap_count_tree(mp, tp, bno, level, count)) < 0)) {
6199 xfs_trans_brelse(tp, bp);
6200 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
6201 XFS_ERRLEVEL_LOW, mp);
6202 return XFS_ERROR(EFSCORRUPTED);
6203 }
6204 xfs_trans_brelse(tp, bp);
6205 } else {
6206 /* count all level 1 nodes and their leaves */
6207 for (;;) {
6208 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
6209 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
6210 frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize,
6211 xfs_bmbt, block, 1, mp->m_bmap_dmxr[0]);
6212 if (unlikely(xfs_bmap_count_leaves(frp, numrecs, count) < 0)) {
6213 xfs_trans_brelse(tp, bp);
6214 XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
6215 XFS_ERRLEVEL_LOW, mp);
6216 return XFS_ERROR(EFSCORRUPTED);
6217 }
6218 xfs_trans_brelse(tp, bp);
6219 if (nextbno == NULLFSBLOCK)
6220 break;
6221 bno = nextbno;
6222 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
6223 XFS_BMAP_BTREE_REF)))
6224 return error;
6225 *count += 1;
6226 block = XFS_BUF_TO_BMBT_BLOCK(bp);
6227 }
6228 }
6229 return 0;
6230}
6231
6232/*
6233 * Count leaf blocks given a pointer to an extent list.
6234 */
6235int
6236xfs_bmap_count_leaves(
6237 xfs_bmbt_rec_t *frp,
6238 int numrecs,
6239 int *count)
6240{
6241 int b;
6242
6243 for ( b = 1; b <= numrecs; b++, frp++)
6244 *count += xfs_bmbt_disk_get_blockcount(frp);
6245 return 0;
6246}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
new file mode 100644
index 000000000000..f1bc22fb26ae
--- /dev/null
+++ b/fs/xfs/xfs_bmap.h
@@ -0,0 +1,379 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BMAP_H__
33#define __XFS_BMAP_H__
34
35struct getbmap;
36struct xfs_bmbt_irec;
37struct xfs_inode;
38struct xfs_mount;
39struct xfs_trans;
40
41/*
42 * List of extents to be free "later".
43 * The list is kept sorted on xbf_startblock.
44 */
45typedef struct xfs_bmap_free_item
46{
47 xfs_fsblock_t xbfi_startblock;/* starting fs block number */
48 xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
49 struct xfs_bmap_free_item *xbfi_next; /* link to next entry */
50} xfs_bmap_free_item_t;
51
52/*
53 * Header for free extent list.
54 */
55typedef struct xfs_bmap_free
56{
57 xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
58 int xbf_count; /* count of items on list */
59 int xbf_low; /* kludge: alloc in low mode */
60} xfs_bmap_free_t;
61
62#define XFS_BMAP_MAX_NMAP 4
63
64/*
65 * Flags for xfs_bmapi
66 */
67#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */
68#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
71#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */
72#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */
73#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */
74#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */
75#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */
76#define	XFS_BMAPI_IGSTATE	0x200	/* Ignore extent state; */
77						/* combine contiguous space */
78#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */
79
80#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAPI_AFLAG)
81int xfs_bmapi_aflag(int w);
82#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w)
83#else
84#define XFS_BMAPI_AFLAG(w) ((w) == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0)
85#endif
86
87/*
88 * Special values for xfs_bmbt_irec_t br_startblock field.
89 */
90#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
91#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
92
93#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_INIT)
94void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp);
95#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp)
96#else
97#define XFS_BMAP_INIT(flp,fbp) \
98 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
99 (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK)
100#endif
101
102/*
103 * Argument structure for xfs_bmap_alloc.
104 */
105typedef struct xfs_bmalloca {
106 xfs_fsblock_t firstblock; /* i/o first block allocated */
107 xfs_fsblock_t rval; /* starting block of new extent */
108 xfs_fileoff_t off; /* offset in file filling in */
109 struct xfs_trans *tp; /* transaction pointer */
110 struct xfs_inode *ip; /* incore inode pointer */
111 struct xfs_bmbt_irec *prevp; /* extent before the new one */
112 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
113 xfs_extlen_t alen; /* i/o length asked/allocated */
114 xfs_extlen_t total; /* total blocks needed for xaction */
115	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
116 xfs_extlen_t minleft; /* amount must be left after alloc */
117 char eof; /* set if allocating past last extent */
118 char wasdel; /* replacing a delayed allocation */
119 char userdata;/* set if is user data */
120 char low; /* low on space, using seq'l ags */
121 char aeof; /* allocated space at eof */
122} xfs_bmalloca_t;
123
124#ifdef __KERNEL__
125
126#if defined(XFS_BMAP_TRACE)
127/*
128 * Trace operations for bmap extent tracing
129 */
130#define XFS_BMAP_KTRACE_DELETE 1
131#define XFS_BMAP_KTRACE_INSERT 2
132#define XFS_BMAP_KTRACE_PRE_UP 3
133#define XFS_BMAP_KTRACE_POST_UP 4
134
135#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */
136#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */
137extern ktrace_t *xfs_bmap_trace_buf;
138
139/*
140 * Add bmap trace insert entries for all the contents of the extent list.
141 */
142void
143xfs_bmap_trace_exlist(
144 char *fname, /* function name */
145 struct xfs_inode *ip, /* incore inode pointer */
146 xfs_extnum_t cnt, /* count of entries in list */
147 int whichfork); /* data or attr fork */
148#else
149#define xfs_bmap_trace_exlist(f,ip,c,w)
150#endif
151
152/*
153 * Convert inode from non-attributed to attributed.
154 * Must not be in a transaction, ip must not be locked.
155 */
156int /* error code */
157xfs_bmap_add_attrfork(
158 struct xfs_inode *ip, /* incore inode pointer */
159 int rsvd); /* flag for reserved block allocation */
160
161/*
162 * Add the extent to the list of extents to be free at transaction end.
163 * The list is maintained sorted (by block number).
164 */
165void
166xfs_bmap_add_free(
167 xfs_fsblock_t bno, /* fs block number of extent */
168 xfs_filblks_t len, /* length of extent */
169 xfs_bmap_free_t *flist, /* list of extents */
170 struct xfs_mount *mp); /* mount point structure */
171
172/*
173 * Routine to clean up the free list data structure when
174 * an error occurs during a transaction.
175 */
176void
177xfs_bmap_cancel(
178 xfs_bmap_free_t *flist); /* free list to clean up */
179
180/*
181 * Compute and fill in the value of the maximum depth of a bmap btree
182 * in this filesystem. Done once, during mount.
183 */
184void
185xfs_bmap_compute_maxlevels(
186 struct xfs_mount *mp, /* file system mount structure */
187 int whichfork); /* data or attr fork */
188
189/*
190 * Routine to be called at transaction's end by the caller of xfs_bmapi
191 * or xfs_bunmapi. Frees all the extents that need freeing, which must be done
192 * last due to locking considerations.
193 *
194 * Return 1 if the given transaction was committed and a new one allocated,
195 * and 0 otherwise.
196 */
197int /* error */
198xfs_bmap_finish(
199 struct xfs_trans **tp, /* transaction pointer addr */
200 xfs_bmap_free_t *flist, /* i/o: list extents to free */
201 xfs_fsblock_t firstblock, /* controlled a.g. for allocs */
202 int *committed); /* xact committed or not */
203
204/*
205 * Returns the file-relative block number of the first unused block in the file.
206 * This is the lowest-address hole if the file has holes, else the first block
207 * past the end of file.
208 */
209int /* error */
210xfs_bmap_first_unused(
211 struct xfs_trans *tp, /* transaction pointer */
212 struct xfs_inode *ip, /* incore inode */
213 xfs_extlen_t len, /* size of hole to find */
214 xfs_fileoff_t *unused, /* unused block num */
215 int whichfork); /* data or attr fork */
216
217/*
218 * Returns the file-relative block number of the last block + 1 before
219 * last_block (input value) in the file.
220 * This is not based on i_size, it is based on the extent list.
221 * Returns 0 for local files, as they do not have an extent list.
222 */
223int /* error */
224xfs_bmap_last_before(
225 struct xfs_trans *tp, /* transaction pointer */
226 struct xfs_inode *ip, /* incore inode */
227 xfs_fileoff_t *last_block, /* last block */
228 int whichfork); /* data or attr fork */
229
230/*
231 * Returns the file-relative block number of the first block past eof in
232 * the file. This is not based on i_size, it is based on the extent list.
233 * Returns 0 for local files, as they do not have an extent list.
234 */
235int /* error */
236xfs_bmap_last_offset(
237 struct xfs_trans *tp, /* transaction pointer */
238 struct xfs_inode *ip, /* incore inode */
239 xfs_fileoff_t *unused, /* last block num */
240 int whichfork); /* data or attr fork */
241
242/*
243 * Returns whether the selected fork of the inode has exactly one
244 * block or not. For the data fork we check this matches di_size,
245 * implying the file's range is 0..bsize-1.
246 */
247int
248xfs_bmap_one_block(
249 struct xfs_inode *ip, /* incore inode */
250 int whichfork); /* data or attr fork */
251
252/*
253 * Read in the extents to iu_extents.
254 * All inode fields are set up by caller, we just traverse the btree
255 * and copy the records in.
256 */
257int /* error */
258xfs_bmap_read_extents(
259 struct xfs_trans *tp, /* transaction pointer */
260 struct xfs_inode *ip, /* incore inode */
261 int whichfork); /* data or attr fork */
262
263/*
264 * Map file blocks to filesystem blocks.
265 * File range is given by the bno/len pair.
266 * Adds blocks to the file if this is a write ("flags & XFS_BMAPI_WRITE"
267 * set) into a hole or past EOF.
268 * Only allocates blocks from a single allocation group,
269 * to avoid locking problems.
270 * The returned value in "firstblock" from the first call in a transaction
271 * must be remembered and presented to subsequent calls in "firstblock".
272 * An upper bound for the number of blocks to be allocated is supplied to
273 * the first call in "total"; if no allocation group has that many free
274 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
275 */
276int /* error */
277xfs_bmapi(
278 struct xfs_trans *tp, /* transaction pointer */
279 struct xfs_inode *ip, /* incore inode */
280 xfs_fileoff_t bno, /* starting file offs. mapped */
281 xfs_filblks_t len, /* length to map in file */
282 int flags, /* XFS_BMAPI_... */
283 xfs_fsblock_t *firstblock, /* first allocated block
284 controls a.g. for allocs */
285 xfs_extlen_t total, /* total blocks needed */
286 struct xfs_bmbt_irec *mval, /* output: map values */
287 int *nmap, /* i/o: mval size/count */
288 xfs_bmap_free_t *flist); /* i/o: list extents to free */
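
/*
 * Editor's example (not part of the original header): a read-only
 * mapping lookup. Read callers conventionally pass a null transaction,
 * null firstblock and null free list; ip, offset_fsb and len_fsb are
 * placeholders.
 */
#if 0	/* illustrative sketch only, not compiled */
	xfs_bmbt_irec_t	map[2];		/* output mappings */
	int		nmap = 2;	/* in: size of map[]; out: used */

	error = xfs_bmapi(NULL, ip, offset_fsb, len_fsb, 0,
			NULL, 0, map, &nmap, NULL);
	/* on success, map[0 .. nmap-1] describe the file range */
#endif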
289
290/*
291 * Map file blocks to filesystem blocks, simple version.
292 * One block only, read-only.
293 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
294 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
295 * was set and all the others were clear.
296 */
297int /* error */
298xfs_bmapi_single(
299 struct xfs_trans *tp, /* transaction pointer */
300 struct xfs_inode *ip, /* incore inode */
301 int whichfork, /* data or attr fork */
302 xfs_fsblock_t *fsb, /* output: mapped block */
303 xfs_fileoff_t bno); /* starting file offs. mapped */
304
305/*
306 * Unmap (remove) blocks from a file.
307 * If nexts is nonzero then the number of extents to remove is limited to
308 * that value. If not all extents in the block range can be removed then
309 * *done is set.
310 */
311int /* error */
312xfs_bunmapi(
313 struct xfs_trans *tp, /* transaction pointer */
314 struct xfs_inode *ip, /* incore inode */
315 xfs_fileoff_t bno, /* starting offset to unmap */
316 xfs_filblks_t len, /* length to unmap in file */
317 int flags, /* XFS_BMAPI_... */
318 xfs_extnum_t nexts, /* number of extents max */
319 xfs_fsblock_t *firstblock, /* first allocated block
320 controls a.g. for allocs */
321 xfs_bmap_free_t *flist, /* i/o: list extents to free */
322 int *done); /* set if not done yet */
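
/*
 * Editor's example (not part of the original header): unmapping a file
 * range a few extents at a time, modelled on the truncate path.
 * XFS_ITRUNC_MAX_EXTENTS is from xfs_inode.h; tp, ip, first_unmap_block,
 * unmap_len, firstfsb, free_list and committed are placeholders.
 */
#if 0	/* illustrative sketch only, not compiled */
	int	done = 0;

	while (!done) {
		XFS_BMAP_INIT(&free_list, &firstfsb);
		error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len,
				0, XFS_ITRUNC_MAX_EXTENTS, &firstfsb,
				&free_list, &done);
		if (error)
			break;	/* real callers xfs_bmap_cancel() here */
		error = xfs_bmap_finish(&tp, &free_list, firstfsb,
				&committed);
		if (error)
			break;
	}
#endif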
323
324/*
325 * Fcntl interface to xfs_bmapi.
326 */
327int /* error code */
328xfs_getbmap(
329 bhv_desc_t *bdp, /* XFS behavior descriptor */
330 struct getbmap *bmv, /* user bmap structure */
331 void __user *ap, /* pointer to user's array */
332 int iflags); /* interface flags */
333
334/*
335 * Check the last inode extent to determine whether this allocation will result
336 * in blocks being allocated at the end of the file. When we allocate new data
337 * blocks at the end of the file which do not start at the previous data block,
338 * we will try to align the new blocks at stripe unit boundaries.
339 */
340int
341xfs_bmap_isaeof(
342 struct xfs_inode *ip,
343 xfs_fileoff_t off,
344 int whichfork,
345 char *aeof);
346
347/*
348 * Check if the endoff is outside the last extent. If so, the caller will
349 * grow the allocation to a stripe unit boundary.
350 */
351int
352xfs_bmap_eof(
353 struct xfs_inode *ip,
354 xfs_fileoff_t endoff,
355 int whichfork,
356 int *eof);
357
358/*
359 * Count fsblocks of the given fork.
360 */
361int
362xfs_bmap_count_blocks(
363 xfs_trans_t *tp,
364 struct xfs_inode *ip,
365 int whichfork,
366 int *count);
367
368/*
369 * Check an extent list, which has just been read, for
370 * any bit in the extent flag field.
371 */
372int
373xfs_check_nostate_extents(
374 xfs_bmbt_rec_t *ep,
375 xfs_extnum_t num);
376
377#endif /* __KERNEL__ */
378
379#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
new file mode 100644
index 000000000000..163305a79fcc
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -0,0 +1,2807 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_itable.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_alloc.h"
59#include "xfs_bit.h"
60#include "xfs_bmap.h"
61#include "xfs_error.h"
62#include "xfs_quota.h"
63
64#if defined(XFS_BMBT_TRACE)
65ktrace_t *xfs_bmbt_trace_buf;
66#endif
67
68/*
69 * Prototypes for internal btree functions.
70 */
71
72
73STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
74STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
75STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
76STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
77STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
78STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
79 xfs_bmbt_key_t *, xfs_btree_cur_t **, int *);
80STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
81
82
83#if defined(XFS_BMBT_TRACE)
84
85static char ARGS[] = "args";
86static char ENTRY[] = "entry";
87static char ERROR[] = "error";
88#undef EXIT
89static char EXIT[] = "exit";
90
91/*
92 * Add a trace buffer entry for the arguments given to the routine,
93 * generic form.
94 */
95STATIC void
96xfs_bmbt_trace_enter(
97 char *func,
98 xfs_btree_cur_t *cur,
99 char *s,
100 int type,
101 int line,
102 __psunsigned_t a0,
103 __psunsigned_t a1,
104 __psunsigned_t a2,
105 __psunsigned_t a3,
106 __psunsigned_t a4,
107 __psunsigned_t a5,
108 __psunsigned_t a6,
109 __psunsigned_t a7,
110 __psunsigned_t a8,
111 __psunsigned_t a9,
112 __psunsigned_t a10)
113{
114 xfs_inode_t *ip;
115 int whichfork;
116
117 ip = cur->bc_private.b.ip;
118 whichfork = cur->bc_private.b.whichfork;
119 ktrace_enter(xfs_bmbt_trace_buf,
120 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
121 (void *)func, (void *)s, (void *)ip, (void *)cur,
122 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
123 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
124 (void *)a8, (void *)a9, (void *)a10);
125 ASSERT(ip->i_btrace);
126 ktrace_enter(ip->i_btrace,
127 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
128 (void *)func, (void *)s, (void *)ip, (void *)cur,
129 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
130 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
131 (void *)a8, (void *)a9, (void *)a10);
132}
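
/*
 * Editor's note: the first slot of each ktrace entry above packs three
 * small values into a single word. A standalone model of that
 * encode/decode convention (plain C, no kernel types; names are
 * illustrative only):
 */
#if 0	/* illustrative sketch only, not compiled */
static unsigned long
trace_word_pack(int type, int whichfork, int line)
{
	/* type in bits 0-7, fork in bits 8-15, line number in bits 16+ */
	return (unsigned long)type | ((unsigned long)whichfork << 8) |
	       ((unsigned long)line << 16);
}

static void
trace_word_unpack(unsigned long w, int *type, int *whichfork, int *line)
{
	*type = (int)(w & 0xff);
	*whichfork = (int)((w >> 8) & 0xff);
	*line = (int)(w >> 16);
}
#endif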
133/*
134 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
135 */
136STATIC void
137xfs_bmbt_trace_argbi(
138 char *func,
139 xfs_btree_cur_t *cur,
140 xfs_buf_t *b,
141 int i,
142 int line)
143{
144 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
145 (__psunsigned_t)b, i, 0, 0,
146 0, 0, 0, 0,
147 0, 0, 0);
148}
149
150/*
151 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
152 */
153STATIC void
154xfs_bmbt_trace_argbii(
155 char *func,
156 xfs_btree_cur_t *cur,
157 xfs_buf_t *b,
158 int i0,
159 int i1,
160 int line)
161{
162 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
163 (__psunsigned_t)b, i0, i1, 0,
164 0, 0, 0, 0,
165 0, 0, 0);
166}
167
168/*
169 * Add a trace buffer entry for arguments, for 3 block-length args
170 * and an integer arg.
171 */
172STATIC void
173xfs_bmbt_trace_argfffi(
174 char *func,
175 xfs_btree_cur_t *cur,
176 xfs_dfiloff_t o,
177 xfs_dfsbno_t b,
178 xfs_dfilblks_t i,
179 int j,
180 int line)
181{
182 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
183 o >> 32, (int)o, b >> 32, (int)b,
184 i >> 32, (int)i, (int)j, 0,
185 0, 0, 0);
186}
187
188/*
189 * Add a trace buffer entry for arguments, for one integer arg.
190 */
191STATIC void
192xfs_bmbt_trace_argi(
193 char *func,
194 xfs_btree_cur_t *cur,
195 int i,
196 int line)
197{
198 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
199 i, 0, 0, 0,
200 0, 0, 0, 0,
201 0, 0, 0);
202}
203
204/*
205 * Add a trace buffer entry for arguments, for int, fsblock, key.
206 */
207STATIC void
208xfs_bmbt_trace_argifk(
209 char *func,
210 xfs_btree_cur_t *cur,
211 int i,
212 xfs_fsblock_t f,
213 xfs_bmbt_key_t *k,
214 int line)
215{
216 xfs_dfsbno_t d;
217 xfs_dfiloff_t o;
218
219 d = (xfs_dfsbno_t)f;
220 o = INT_GET(k->br_startoff, ARCH_CONVERT);
221 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
222 i, d >> 32, (int)d, o >> 32,
223 (int)o, 0, 0, 0,
224 0, 0, 0);
225}
226
227/*
228 * Add a trace buffer entry for arguments, for int, fsblock, rec.
229 */
230STATIC void
231xfs_bmbt_trace_argifr(
232 char *func,
233 xfs_btree_cur_t *cur,
234 int i,
235 xfs_fsblock_t f,
236 xfs_bmbt_rec_t *r,
237 int line)
238{
239 xfs_dfsbno_t b;
240 xfs_dfilblks_t c;
241 xfs_dfsbno_t d;
242 xfs_dfiloff_t o;
243 xfs_bmbt_irec_t s;
244
245 d = (xfs_dfsbno_t)f;
246 xfs_bmbt_disk_get_all(r, &s);
247 o = (xfs_dfiloff_t)s.br_startoff;
248 b = (xfs_dfsbno_t)s.br_startblock;
249 c = s.br_blockcount;
250 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
251 i, d >> 32, (int)d, o >> 32,
252 (int)o, b >> 32, (int)b, c >> 32,
253 (int)c, 0, 0);
254}
255
256/*
257 * Add a trace buffer entry for arguments, for int, key.
258 */
259STATIC void
260xfs_bmbt_trace_argik(
261 char *func,
262 xfs_btree_cur_t *cur,
263 int i,
264 xfs_bmbt_key_t *k,
265 int line)
266{
267 xfs_dfiloff_t o;
268
269 o = INT_GET(k->br_startoff, ARCH_CONVERT);
270 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIK, line,
271 i, o >> 32, (int)o, 0,
272 0, 0, 0, 0,
273 0, 0, 0);
274}
275
276/*
277 * Add a trace buffer entry for the cursor/operation.
278 */
279STATIC void
280xfs_bmbt_trace_cursor(
281 char *func,
282 xfs_btree_cur_t *cur,
283 char *s,
284 int line)
285{
286 xfs_bmbt_rec_t r;
287
288 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
289 xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
290 (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
291 cur->bc_private.b.allocated,
292 INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT),
293 (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
294 (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
295 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
296 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
297}
298
299#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
300 xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__)
301#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
302 xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__)
303#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
304 xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
305#define XFS_BMBT_TRACE_ARGI(c,i) \
306 xfs_bmbt_trace_argi(fname, c, i, __LINE__)
307#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \
308 xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__)
309#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
310 xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
311#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
312 xfs_bmbt_trace_argik(fname, c, i, k, __LINE__)
313#define XFS_BMBT_TRACE_CURSOR(c,s) \
314 xfs_bmbt_trace_cursor(fname, c, s, __LINE__)
315#else
316#define XFS_BMBT_TRACE_ARGBI(c,b,i)
317#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
318#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
319#define XFS_BMBT_TRACE_ARGI(c,i)
320#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k)
321#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
322#define XFS_BMBT_TRACE_ARGIK(c,i,k)
323#define XFS_BMBT_TRACE_CURSOR(c,s)
324#endif /* XFS_BMBT_TRACE */
325
326
327/*
328 * Internal functions.
329 */
330
331/*
332 * Delete record pointed to by cur/level.
333 */
334STATIC int /* error */
335xfs_bmbt_delrec(
336 xfs_btree_cur_t *cur,
337 int level,
338 int *stat) /* success/failure */
339{
340 xfs_bmbt_block_t *block; /* bmap btree block */
341 xfs_fsblock_t bno; /* fs-relative block number */
342 xfs_buf_t *bp; /* buffer for block */
343 int error; /* error return value */
344#ifdef XFS_BMBT_TRACE
345 static char fname[] = "xfs_bmbt_delrec";
346#endif
347 int i; /* loop counter */
348 int j; /* temp state */
349 xfs_bmbt_key_t key; /* bmap btree key */
350 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
351 xfs_fsblock_t lbno; /* left sibling block number */
352 xfs_buf_t *lbp; /* left buffer pointer */
353 xfs_bmbt_block_t *left; /* left btree block */
354 xfs_bmbt_key_t *lkp; /* left btree key */
355 xfs_bmbt_ptr_t *lpp; /* left address pointer */
356 int lrecs=0; /* left record count */
357 xfs_bmbt_rec_t *lrp; /* left record pointer */
358 xfs_mount_t *mp; /* file system mount point */
359 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
360 int ptr; /* key/record index */
361 xfs_fsblock_t rbno; /* right sibling block number */
362 xfs_buf_t *rbp; /* right buffer pointer */
363 xfs_bmbt_block_t *right; /* right btree block */
364 xfs_bmbt_key_t *rkp; /* right btree key */
365 xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
366 xfs_bmbt_ptr_t *rpp; /* right address pointer */
367 xfs_bmbt_block_t *rrblock; /* right-right btree block */
368 xfs_buf_t *rrbp; /* right-right buffer pointer */
369 int rrecs=0; /* right record count */
370 xfs_bmbt_rec_t *rrp; /* right record pointer */
371 xfs_btree_cur_t *tcur; /* temporary btree cursor */
372 int numrecs; /* temporary numrec count */
373 int numlrecs, numrrecs;
374
375 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
376 XFS_BMBT_TRACE_ARGI(cur, level);
377 ptr = cur->bc_ptrs[level];
378 tcur = (xfs_btree_cur_t *)0;
379 if (ptr == 0) {
380 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
381 *stat = 0;
382 return 0;
383 }
384 block = xfs_bmbt_get_block(cur, level, &bp);
385 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
386#ifdef DEBUG
387 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
388 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
389 goto error0;
390 }
391#endif
392 if (ptr > numrecs) {
393 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
394 *stat = 0;
395 return 0;
396 }
397 XFS_STATS_INC(xs_bmbt_delrec);
398 if (level > 0) {
399 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
400 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
401#ifdef DEBUG
402 for (i = ptr; i < numrecs; i++) {
403 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
404 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
405 goto error0;
406 }
407 }
408#endif
409 if (ptr < numrecs) {
410 memmove(&kp[ptr - 1], &kp[ptr],
411 (numrecs - ptr) * sizeof(*kp));
412 memmove(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */
413 (numrecs - ptr) * sizeof(*pp));
414 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
415 xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
416 }
417 } else {
418 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
419 if (ptr < numrecs) {
420 memmove(&rp[ptr - 1], &rp[ptr],
421 (numrecs - ptr) * sizeof(*rp));
422 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
423 }
424 if (ptr == 1) {
425 INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp));
426 kp = &key;
427 }
428 }
429 numrecs--;
430 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
431 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
432 /*
433 * If we're at the root level:
434 * First, shrink the root block in-memory.
435 * Then try to get rid of the next level down.
436 * If we can't, then there's nothing left to do.
437 */
438 if (level == cur->bc_nlevels - 1) {
439 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
440 cur->bc_private.b.whichfork);
441 if ((error = xfs_bmbt_killroot(cur))) {
442 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
443 goto error0;
444 }
445 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
446 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
447 goto error0;
448 }
449 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
450 *stat = 1;
451 return 0;
452 }
453 if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
454 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
455 goto error0;
456 }
457 if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
458 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
459 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
460 goto error0;
461 }
462 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
463 *stat = 1;
464 return 0;
465 }
466 rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
467 lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
468 /*
469 * The root has only one child; we need a chance to copy its contents
470 * into the root and then delete the child. We can't go up to the next
471 * level; there's nothing to delete there.
472 */
473 if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
474 level == cur->bc_nlevels - 2) {
475 if ((error = xfs_bmbt_killroot(cur))) {
476 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
477 goto error0;
478 }
479 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
480 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
481 goto error0;
482 }
483 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
484 *stat = 1;
485 return 0;
486 }
487 ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
488 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
489 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
490 goto error0;
491 }
492 bno = NULLFSBLOCK;
493 if (rbno != NULLFSBLOCK) {
494 i = xfs_btree_lastrec(tcur, level);
495 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
496 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
497 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
498 goto error0;
499 }
500 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
501 i = xfs_btree_lastrec(tcur, level);
502 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
503 rbp = tcur->bc_bufs[level];
504 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
505#ifdef DEBUG
506 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
507 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
508 goto error0;
509 }
510#endif
511 bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
512 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
513 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
514 if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
515 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
516 goto error0;
517 }
518 if (i) {
519 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
520 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
521 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
522 tcur = NULL;
523 if (level > 0) {
524 if ((error = xfs_bmbt_decrement(cur,
525 level, &i))) {
526 XFS_BMBT_TRACE_CURSOR(cur,
527 ERROR);
528 goto error0;
529 }
530 }
531 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
532 *stat = 1;
533 return 0;
534 }
535 }
536 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
537 if (lbno != NULLFSBLOCK) {
538 i = xfs_btree_firstrec(tcur, level);
539 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
540 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
541 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
542 goto error0;
543 }
544 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
545 }
546 }
547 if (lbno != NULLFSBLOCK) {
548 i = xfs_btree_firstrec(tcur, level);
549 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
550 /*
551 * decrement to the last record in the left sibling block
552 */
553 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
554 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
555 goto error0;
556 }
557 i = xfs_btree_firstrec(tcur, level);
558 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
559 lbp = tcur->bc_bufs[level];
560 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
561#ifdef DEBUG
562 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
563 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
564 goto error0;
565 }
566#endif
567 bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
568 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
569 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
570 if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
571 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
572 goto error0;
573 }
574 if (i) {
575 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
576 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
577 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
578 tcur = NULL;
579 if (level == 0)
580 cur->bc_ptrs[0]++;
581 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
582 *stat = 1;
583 return 0;
584 }
585 }
586 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
587 }
588 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
589 tcur = NULL;
590 mp = cur->bc_mp;
591 ASSERT(bno != NULLFSBLOCK);
592 if (lbno != NULLFSBLOCK &&
593 lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
594 rbno = bno;
595 right = block;
596 rbp = bp;
597 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
598 XFS_BMAP_BTREE_REF))) {
599 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
600 goto error0;
601 }
602 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
603 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
604 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
605 goto error0;
606 }
607 } else if (rbno != NULLFSBLOCK &&
608 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
609 XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
610 lbno = bno;
611 left = block;
612 lbp = bp;
613 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
614 XFS_BMAP_BTREE_REF))) {
615 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
616 goto error0;
617 }
618 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
619 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
620 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
621 goto error0;
622 }
623 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
624 } else {
625 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
626 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
627 goto error0;
628 }
629 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
630 *stat = 1;
631 return 0;
632 }
633 numlrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
634 numrrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
635 if (level > 0) {
636 lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
637 lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
638 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
639 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
640#ifdef DEBUG
641 for (i = 0; i < numrrecs; i++) {
642 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
643 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
644 goto error0;
645 }
646 }
647#endif
648 memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
649 memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
650 xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
651 xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
652 } else {
653 lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
654 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
655 memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
656 xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
657 }
658 INT_MOD(left->bb_numrecs, ARCH_CONVERT, numrrecs);
659 left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
660 xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
661 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
662 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
663 INT_GET(left->bb_rightsib, ARCH_CONVERT),
664 0, &rrbp, XFS_BMAP_BTREE_REF))) {
665 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
666 goto error0;
667 }
668 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
669 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
670 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
671 goto error0;
672 }
673 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
674 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
675 }
676 xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
677 cur->bc_private.b.flist, mp);
678 cur->bc_private.b.ip->i_d.di_nblocks--;
679 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
680 XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
681 XFS_TRANS_DQ_BCOUNT, -1L);
682 xfs_trans_binval(cur->bc_tp, rbp);
683 if (bp != lbp) {
684 cur->bc_bufs[level] = lbp;
685 cur->bc_ptrs[level] += lrecs;
686 cur->bc_ra[level] = 0;
687 } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
688 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
689 goto error0;
690 }
691 if (level > 0)
692 cur->bc_ptrs[level]--;
693 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
694 *stat = 2;
695 return 0;
696
697error0:
698 if (tcur)
699 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
700 return error;
701}
702
703#ifdef DEBUG
704/*
705 * Get the data from the pointed-to record.
706 */
707int
708xfs_bmbt_get_rec(
709 xfs_btree_cur_t *cur,
710 xfs_fileoff_t *off,
711 xfs_fsblock_t *bno,
712 xfs_filblks_t *len,
713 xfs_exntst_t *state,
714 int *stat)
715{
716 xfs_bmbt_block_t *block;
717 xfs_buf_t *bp;
718#ifdef DEBUG
719 int error;
720#endif
721 int ptr;
722 xfs_bmbt_rec_t *rp;
723
724 block = xfs_bmbt_get_block(cur, 0, &bp);
725 ptr = cur->bc_ptrs[0];
726#ifdef DEBUG
727 if ((error = xfs_btree_check_lblock(cur, block, 0, bp)))
728 return error;
729#endif
730 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
731 *stat = 0;
732 return 0;
733 }
734 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
735 *off = xfs_bmbt_disk_get_startoff(rp);
736 *bno = xfs_bmbt_disk_get_startblock(rp);
737 *len = xfs_bmbt_disk_get_blockcount(rp);
738 *state = xfs_bmbt_disk_get_state(rp);
739 *stat = 1;
740 return 0;
741}
742#endif
743
744/*
745 * Insert one record/level. Return information to the caller
746 * allowing the next level up to proceed if necessary.
747 */
748STATIC int /* error */
749xfs_bmbt_insrec(
750 xfs_btree_cur_t *cur,
751 int level,
752 xfs_fsblock_t *bnop,
753 xfs_bmbt_rec_t *recp,
754 xfs_btree_cur_t **curp,
755 int *stat) /* no-go/done/continue */
756{
757 xfs_bmbt_block_t *block; /* bmap btree block */
758 xfs_buf_t *bp; /* buffer for block */
759 int error; /* error return value */
760#ifdef XFS_BMBT_TRACE
761 static char fname[] = "xfs_bmbt_insrec";
762#endif
763 int i; /* loop index */
764 xfs_bmbt_key_t key; /* bmap btree key */
765 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
766 int logflags; /* inode logging flags */
767 xfs_fsblock_t nbno; /* new block number */
768 struct xfs_btree_cur *ncur; /* new btree cursor */
769 xfs_bmbt_key_t nkey; /* new btree key value */
770 xfs_bmbt_rec_t nrec; /* new record */
771 int optr; /* old key/record index */
772 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
773 int ptr; /* key/record index */
774 xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */
775 int numrecs;
776
777 ASSERT(level < cur->bc_nlevels);
778 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
779 XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
780 ncur = (xfs_btree_cur_t *)0;
781 INT_SET(key.br_startoff, ARCH_CONVERT,
782 xfs_bmbt_disk_get_startoff(recp));
783 optr = ptr = cur->bc_ptrs[level];
784 if (ptr == 0) {
785 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
786 *stat = 0;
787 return 0;
788 }
789 XFS_STATS_INC(xs_bmbt_insrec);
790 block = xfs_bmbt_get_block(cur, level, &bp);
791 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
792#ifdef DEBUG
793 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
794 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
795 return error;
796 }
797 if (ptr <= numrecs) {
798 if (level == 0) {
799 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
800 xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
801 } else {
802 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
803 xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
804 }
805 }
806#endif
807 nbno = NULLFSBLOCK;
808 if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
809 if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
810 /*
811 * A root block, that can be made bigger.
812 */
813 xfs_iroot_realloc(cur->bc_private.b.ip, 1,
814 cur->bc_private.b.whichfork);
815 block = xfs_bmbt_get_block(cur, level, &bp);
816 } else if (level == cur->bc_nlevels - 1) {
817 if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
818 *stat == 0) {
819 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
820 return error;
821 }
822 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
823 logflags);
824 block = xfs_bmbt_get_block(cur, level, &bp);
825 } else {
826 if ((error = xfs_bmbt_rshift(cur, level, &i))) {
827 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
828 return error;
829 }
830 if (i) {
831 /* nothing */
832 } else {
833 if ((error = xfs_bmbt_lshift(cur, level, &i))) {
834 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
835 return error;
836 }
837 if (i) {
838 optr = ptr = cur->bc_ptrs[level];
839 } else {
840 if ((error = xfs_bmbt_split(cur, level,
841 &nbno, &nkey, &ncur,
842 &i))) {
843 XFS_BMBT_TRACE_CURSOR(cur,
844 ERROR);
845 return error;
846 }
847 if (i) {
848 block = xfs_bmbt_get_block(
849 cur, level, &bp);
850#ifdef DEBUG
851 if ((error =
852 xfs_btree_check_lblock(cur,
853 block, level, bp))) {
854 XFS_BMBT_TRACE_CURSOR(
855 cur, ERROR);
856 return error;
857 }
858#endif
859 ptr = cur->bc_ptrs[level];
860 xfs_bmbt_disk_set_allf(&nrec,
861 nkey.br_startoff, 0, 0,
862 XFS_EXT_NORM);
863 } else {
864 XFS_BMBT_TRACE_CURSOR(cur,
865 EXIT);
866 *stat = 0;
867 return 0;
868 }
869 }
870 }
871 }
872 }
873 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
874 if (level > 0) {
875 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
876 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
877#ifdef DEBUG
878 for (i = numrecs; i >= ptr; i--) {
879 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT),
880 level))) {
881 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
882 return error;
883 }
884 }
885#endif
886 memmove(&kp[ptr], &kp[ptr - 1],
887 (numrecs - ptr + 1) * sizeof(*kp));
888 memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */
889 (numrecs - ptr + 1) * sizeof(*pp));
890#ifdef DEBUG
891 if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop,
892 level))) {
893 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
894 return error;
895 }
896#endif
897 kp[ptr - 1] = key;
898 INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
899 numrecs++;
900 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
901 xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
902 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
903 } else {
904 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
905 memmove(&rp[ptr], &rp[ptr - 1],
906 (numrecs - ptr + 1) * sizeof(*rp));
907 rp[ptr - 1] = *recp;
908 numrecs++;
909 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
910 xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
911 }
912 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
913#ifdef DEBUG
914 if (ptr < numrecs) {
915 if (level == 0)
916 xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
917 rp + ptr);
918 else
919 xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
920 kp + ptr);
921 }
922#endif
923 if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
924 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
925 return error;
926 }
927 *bnop = nbno;
928 if (nbno != NULLFSBLOCK) {
929 *recp = nrec;
930 *curp = ncur;
931 }
932 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
933 *stat = 1;
934 return 0;
935}
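
/*
 * Editor's note: xfs_bmbt_insrec() reports a split back to its caller
 * through *bnop, *recp and *curp, and the caller feeds those into the
 * next level up. A simplified sketch of that driving loop; the real
 * version is xfs_bmbt_insert(), later in this file, with error and
 * corruption checks that are omitted here:
 */
#if 0	/* illustrative sketch only, not compiled */
	level = 0;
	nbno = NULLFSBLOCK;
	xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
	ncur = NULL;
	pcur = cur;
	do {
		error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec,
				&ncur, &i);
		if (ncur) {	/* a split produced a new cursor */
			pcur = ncur;
			ncur = NULL;
		}
	} while (nbno != NULLFSBLOCK);	/* until nothing propagates up */
#endif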
936
937STATIC int
938xfs_bmbt_killroot(
939 xfs_btree_cur_t *cur)
940{
941 xfs_bmbt_block_t *block;
942 xfs_bmbt_block_t *cblock;
943 xfs_buf_t *cbp;
944 xfs_bmbt_key_t *ckp;
945 xfs_bmbt_ptr_t *cpp;
946#ifdef DEBUG
947 int error;
948#endif
949#ifdef XFS_BMBT_TRACE
950 static char fname[] = "xfs_bmbt_killroot";
951#endif
952 int i;
953 xfs_bmbt_key_t *kp;
954 xfs_inode_t *ip;
955 xfs_ifork_t *ifp;
956 int level;
957 xfs_bmbt_ptr_t *pp;
958
959 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
960 level = cur->bc_nlevels - 1;
961 ASSERT(level >= 1);
962 /*
963 * Don't deal with the case where the root block needs to be a leaf.
964 * We're just going to turn the thing back into extents anyway.
965 */
966 if (level == 1) {
967 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
968 return 0;
969 }
970 block = xfs_bmbt_get_block(cur, level, &cbp);
971 /*
972 * Give up if the root has multiple children.
973 */
974 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) != 1) {
975 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
976 return 0;
977 }
978 /*
979 * Only do this if the next level down will fit.
980 * Then the child's data is copied up into the inode root,
981 * and it is the child block, not the root, that gets freed.
982 */
983 cbp = cur->bc_bufs[level - 1];
984 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
985 if (INT_GET(cblock->bb_numrecs, ARCH_CONVERT) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
986 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
987 return 0;
988 }
989 ASSERT(INT_GET(cblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
990 ASSERT(INT_GET(cblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
991 ip = cur->bc_private.b.ip;
992 ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
993 ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
994 XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
995 i = (int)(INT_GET(cblock->bb_numrecs, ARCH_CONVERT) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
996 if (i) {
997 xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
998 block = ifp->if_broot;
999 }
1000 INT_MOD(block->bb_numrecs, ARCH_CONVERT, i);
1001 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) == INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
1002 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
1003 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
1004 memcpy(kp, ckp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
1005 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
1006 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
1007#ifdef DEBUG
1008 for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
1009 if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) {
1010 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1011 return error;
1012 }
1013 }
1014#endif
1015 memcpy(pp, cpp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
1016 xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
1017 cur->bc_private.b.flist, cur->bc_mp);
1018 ip->i_d.di_nblocks--;
1019 XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
1020 XFS_TRANS_DQ_BCOUNT, -1L);
1021 xfs_trans_binval(cur->bc_tp, cbp);
1022 cur->bc_bufs[level - 1] = NULL;
1023 INT_MOD(block->bb_level, ARCH_CONVERT, -1);
1024 xfs_trans_log_inode(cur->bc_tp, ip,
1025 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1026 cur->bc_nlevels--;
1027 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1028 return 0;
1029}
1030
1031/*
1032 * Log key values from the btree block.
1033 */
1034STATIC void
1035xfs_bmbt_log_keys(
1036 xfs_btree_cur_t *cur,
1037 xfs_buf_t *bp,
1038 int kfirst,
1039 int klast)
1040{
1041#ifdef XFS_BMBT_TRACE
1042 static char fname[] = "xfs_bmbt_log_keys";
1043#endif
1044 xfs_trans_t *tp;
1045
1046 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1047 XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
1048 tp = cur->bc_tp;
1049 if (bp) {
1050 xfs_bmbt_block_t *block;
1051 int first;
1052 xfs_bmbt_key_t *kp;
1053 int last;
1054
1055 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1056 kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
1057 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
1058 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
1059 xfs_trans_log_buf(tp, bp, first, last);
1060 } else {
1061 xfs_inode_t *ip;
1062
1063 ip = cur->bc_private.b.ip;
1064 xfs_trans_log_inode(tp, ip,
1065 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1066 }
1067 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1068}
1069
1070/*
1071 * Log pointer values from the btree block.
1072 */
1073STATIC void
1074xfs_bmbt_log_ptrs(
1075 xfs_btree_cur_t *cur,
1076 xfs_buf_t *bp,
1077 int pfirst,
1078 int plast)
1079{
1080#ifdef XFS_BMBT_TRACE
1081 static char fname[] = "xfs_bmbt_log_ptrs";
1082#endif
1083 xfs_trans_t *tp;
1084
1085 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1086 XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
1087 tp = cur->bc_tp;
1088 if (bp) {
1089 xfs_bmbt_block_t *block;
1090 int first;
1091 int last;
1092 xfs_bmbt_ptr_t *pp;
1093
1094 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1095 pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
1096 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
1097 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
1098 xfs_trans_log_buf(tp, bp, first, last);
1099 } else {
1100 xfs_inode_t *ip;
1101
1102 ip = cur->bc_private.b.ip;
1103 xfs_trans_log_inode(tp, ip,
1104 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1105 }
1106 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1107}
1108
1109/*
1110 * Look up the record. The cursor is made to point to it, based on dir.
1111 */
1112STATIC int /* error */
1113xfs_bmbt_lookup(
1114 xfs_btree_cur_t *cur,
1115 xfs_lookup_t dir,
1116 int *stat) /* success/failure */
1117{
1118 xfs_bmbt_block_t *block=NULL;
1119 xfs_buf_t *bp;
1120 xfs_daddr_t d;
1121 xfs_sfiloff_t diff;
1122 int error; /* error return value */
1123#ifdef XFS_BMBT_TRACE
1124 static char fname[] = "xfs_bmbt_lookup";
1125#endif
1126 xfs_fsblock_t fsbno=0;
1127 int high;
1128 int i;
1129 int keyno=0;
1130 xfs_bmbt_key_t *kkbase=NULL;
1131 xfs_bmbt_key_t *kkp;
1132 xfs_bmbt_rec_t *krbase=NULL;
1133 xfs_bmbt_rec_t *krp;
1134 int level;
1135 int low;
1136 xfs_mount_t *mp;
1137 xfs_bmbt_ptr_t *pp;
1138 xfs_bmbt_irec_t *rp;
1139 xfs_fileoff_t startoff;
1140 xfs_trans_t *tp;
1141
1142 XFS_STATS_INC(xs_bmbt_lookup);
1143 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1144 XFS_BMBT_TRACE_ARGI(cur, (int)dir);
1145 tp = cur->bc_tp;
1146 mp = cur->bc_mp;
1147 rp = &cur->bc_rec.b;
1148 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1149 if (level < cur->bc_nlevels - 1) {
1150 d = XFS_FSB_TO_DADDR(mp, fsbno);
1151 bp = cur->bc_bufs[level];
1152 if (bp && XFS_BUF_ADDR(bp) != d)
1153 bp = (xfs_buf_t *)0;
1154 if (!bp) {
1155 if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
1156 0, &bp, XFS_BMAP_BTREE_REF))) {
1157 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1158 return error;
1159 }
1160 xfs_btree_setbuf(cur, level, bp);
1161 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1162 if ((error = xfs_btree_check_lblock(cur, block,
1163 level, bp))) {
1164 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1165 return error;
1166 }
1167 } else
1168 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1169 } else
1170 block = xfs_bmbt_get_block(cur, level, &bp);
1171 if (diff == 0)
1172 keyno = 1;
1173 else {
1174 if (level > 0)
1175 kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
1176 else
1177 krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
1178 low = 1;
1179 if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
1180 ASSERT(level == 0);
1181 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1182 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1183 *stat = 0;
1184 return 0;
1185 }
1186 while (low <= high) {
1187 XFS_STATS_INC(xs_bmbt_compare);
1188 keyno = (low + high) >> 1;
1189 if (level > 0) {
1190 kkp = kkbase + keyno - 1;
1191 startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT);
1192 } else {
1193 krp = krbase + keyno - 1;
1194 startoff = xfs_bmbt_disk_get_startoff(krp);
1195 }
1196 diff = (xfs_sfiloff_t)
1197 (startoff - rp->br_startoff);
1198 if (diff < 0)
1199 low = keyno + 1;
1200 else if (diff > 0)
1201 high = keyno - 1;
1202 else
1203 break;
1204 }
1205 }
1206 if (level > 0) {
1207 if (diff > 0 && --keyno < 1)
1208 keyno = 1;
1209 pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
1210#ifdef DEBUG
1211 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
1212 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1213 return error;
1214 }
1215#endif
1216 fsbno = INT_GET(*pp, ARCH_CONVERT);
1217 cur->bc_ptrs[level] = keyno;
1218 }
1219 }
1220 if (dir != XFS_LOOKUP_LE && diff < 0) {
1221 keyno++;
1222 /*
1223 * If this is a GE search and we went off the end of the block, but it's
1224 * not the last block, we're in the wrong block.
1225 */
1226 if (dir == XFS_LOOKUP_GE && keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
1227 INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
1228 cur->bc_ptrs[0] = keyno;
1229 if ((error = xfs_bmbt_increment(cur, 0, &i))) {
1230 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1231 return error;
1232 }
1233 XFS_WANT_CORRUPTED_RETURN(i == 1);
1234 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1235 *stat = 1;
1236 return 0;
1237 }
1238 }
1239 else if (dir == XFS_LOOKUP_LE && diff > 0)
1240 keyno--;
1241 cur->bc_ptrs[0] = keyno;
1242 if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
1243 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1244 *stat = 0;
1245 } else {
1246 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1247 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1248 }
1249 return 0;
1250}
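
/*
 * Editor's note: a minimal, self-contained model of the search
 * convention used above: binary search over 1-based entries, then an
 * LE-style adjustment of the final index. Plain C, no kernel types;
 * names are illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
/* Return the 1-based index of the last entry in sorted a[1..n] <= want. */
static int
lookup_le(const long *a, int n, long want)
{
	int	low = 1, high = n, keyno = 0;
	long	diff = 1;

	if (n == 0)
		return 0;
	while (low <= high) {
		keyno = (low + high) >> 1;
		diff = a[keyno] - want;
		if (diff < 0)
			low = keyno + 1;	/* too small, go right */
		else if (diff > 0)
			high = keyno - 1;	/* too big, go left */
		else
			break;			/* exact match */
	}
	if (diff > 0)
		keyno--;	/* a[keyno] was too big; step back one */
	return keyno;		/* 0 means no entry is <= want */
}
#endif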
1251
1252/*
1253 * Move 1 record left from cur/level if possible.
1254 * Update cur to reflect the new path.
1255 */
1256STATIC int /* error */
1257xfs_bmbt_lshift(
1258 xfs_btree_cur_t *cur,
1259 int level,
1260 int *stat) /* success/failure */
1261{
1262 int error; /* error return value */
1263#ifdef XFS_BMBT_TRACE
1264 static char fname[] = "xfs_bmbt_lshift";
1265#endif
1266#ifdef DEBUG
1267 int i; /* loop counter */
1268#endif
1269 xfs_bmbt_key_t key; /* bmap btree key */
1270 xfs_buf_t *lbp; /* left buffer pointer */
1271 xfs_bmbt_block_t *left; /* left btree block */
1272 xfs_bmbt_key_t *lkp=NULL; /* left btree key */
1273 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1274 int lrecs; /* left record count */
1275 xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */
1276 xfs_mount_t *mp; /* file system mount point */
1277 xfs_buf_t *rbp; /* right buffer pointer */
1278 xfs_bmbt_block_t *right; /* right btree block */
1279 xfs_bmbt_key_t *rkp=NULL; /* right btree key */
1280 xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */
1281 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1282 int rrecs; /* right record count */
1283
1284 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1285 XFS_BMBT_TRACE_ARGI(cur, level);
1286 if (level == cur->bc_nlevels - 1) {
1287 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1288 *stat = 0;
1289 return 0;
1290 }
1291 rbp = cur->bc_bufs[level];
1292 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1293#ifdef DEBUG
1294 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1295 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1296 return error;
1297 }
1298#endif
1299 if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
1300 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1301 *stat = 0;
1302 return 0;
1303 }
1304 if (cur->bc_ptrs[level] <= 1) {
1305 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1306 *stat = 0;
1307 return 0;
1308 }
1309 mp = cur->bc_mp;
1310 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0,
1311 &lbp, XFS_BMAP_BTREE_REF))) {
1312 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1313 return error;
1314 }
1315 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1316 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1317 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1318 return error;
1319 }
1320 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1321 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1322 *stat = 0;
1323 return 0;
1324 }
1325 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
1326 if (level > 0) {
1327 lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
1328 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1329 *lkp = *rkp;
1330 xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
1331 lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
1332 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1333#ifdef DEBUG
1334 if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) {
1335 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1336 return error;
1337 }
1338#endif
1339 *lpp = *rpp; /* INT_: direct copy */
1340 xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
1341 } else {
1342 lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
1343 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1344 *lrp = *rrp;
1345 xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
1346 }
1347 INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs);
1348 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1349#ifdef DEBUG
1350 if (level > 0)
1351 xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
1352 else
1353 xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
1354#endif
1355 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1;
1356 INT_SET(right->bb_numrecs, ARCH_CONVERT, rrecs);
1357 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1358 if (level > 0) {
1359#ifdef DEBUG
1360 for (i = 0; i < rrecs; i++) {
1361 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
1362 level))) {
1363 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1364 return error;
1365 }
1366 }
1367#endif
1368 memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
1369 memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
1370 xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
1371 xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
1372 } else {
1373 memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
1374 xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
1375 INT_SET(key.br_startoff, ARCH_CONVERT,
1376 xfs_bmbt_disk_get_startoff(rrp));
1377 rkp = &key;
1378 }
1379 if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
1380 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1381 return error;
1382 }
1383 cur->bc_ptrs[level]--;
1384 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1385 *stat = 1;
1386 return 0;
1387}
1388
1389/*
1390 * Move 1 record right from cur/level if possible.
1391 * Update cur to reflect the new path.
1392 */
1393STATIC int /* error */
1394xfs_bmbt_rshift(
1395 xfs_btree_cur_t *cur,
1396 int level,
1397 int *stat) /* success/failure */
1398{
1399 int error; /* error return value */
1400#ifdef XFS_BMBT_TRACE
1401 static char fname[] = "xfs_bmbt_rshift";
1402#endif
1403 int i; /* loop counter */
1404 xfs_bmbt_key_t key; /* bmap btree key */
1405 xfs_buf_t *lbp; /* left buffer pointer */
1406 xfs_bmbt_block_t *left; /* left btree block */
1407 xfs_bmbt_key_t *lkp; /* left btree key */
1408 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1409 xfs_bmbt_rec_t *lrp; /* left record pointer */
1410 xfs_mount_t *mp; /* file system mount point */
1411 xfs_buf_t *rbp; /* right buffer pointer */
1412 xfs_bmbt_block_t *right; /* right btree block */
1413 xfs_bmbt_key_t *rkp; /* right btree key */
1414 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1415 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1416 struct xfs_btree_cur *tcur; /* temporary btree cursor */
1417
1418 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1419 XFS_BMBT_TRACE_ARGI(cur, level);
1420 if (level == cur->bc_nlevels - 1) {
1421 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1422 *stat = 0;
1423 return 0;
1424 }
1425 lbp = cur->bc_bufs[level];
1426 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1427#ifdef DEBUG
1428 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1429 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1430 return error;
1431 }
1432#endif
1433 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
1434 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1435 *stat = 0;
1436 return 0;
1437 }
1438 if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
1439 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1440 *stat = 0;
1441 return 0;
1442 }
1443 mp = cur->bc_mp;
1444 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
1445 &rbp, XFS_BMAP_BTREE_REF))) {
1446 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1447 return error;
1448 }
1449 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1450 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1451 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1452 return error;
1453 }
1454 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1455 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1456 *stat = 0;
1457 return 0;
1458 }
1459 if (level > 0) {
1460 lkp = XFS_BMAP_KEY_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1461 lpp = XFS_BMAP_PTR_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1462 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1463 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1464#ifdef DEBUG
1465 for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
1466 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
1467 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1468 return error;
1469 }
1470 }
1471#endif
1472 memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1473 memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1474#ifdef DEBUG
1475 if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) {
1476 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1477 return error;
1478 }
1479#endif
1480 *rkp = *lkp;
1481 *rpp = *lpp; /* INT_: direct copy */
1482 xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1483 xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1484 } else {
1485 lrp = XFS_BMAP_REC_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1486 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1487 memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1488 *rrp = *lrp;
1489 xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1490 INT_SET(key.br_startoff, ARCH_CONVERT,
1491 xfs_bmbt_disk_get_startoff(rrp));
1492 rkp = &key;
1493 }
1494 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
1495 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1496 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1497#ifdef DEBUG
1498 if (level > 0)
1499 xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
1500 else
1501 xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
1502#endif
1503 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1504 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
1505 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1506 return error;
1507 }
1508 i = xfs_btree_lastrec(tcur, level);
1509 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1510 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
1511 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1512 goto error1;
1513 }
1514 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1515 if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
1516 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1517 goto error1;
1518 }
1519 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1520 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1521 *stat = 1;
1522 return 0;
1523error0:
1524 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1525error1:
1526 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1527 return error;
1528}
1529
1530/*
1531 * Determine the extent state.
1532 */
1533/* ARGSUSED */
1534STATIC xfs_exntst_t
1535xfs_extent_state(
1536 xfs_filblks_t blks,
1537 int extent_flag)
1538{
1539 if (extent_flag) {
1540 ASSERT(blks != 0); /* saved for DMIG */
1541 return XFS_EXT_UNWRITTEN;
1542 }
1543 return XFS_EXT_NORM;
1544}
1545
1546
1547/*
1548 * Split cur/level block in half.
1549 * Return new block number and its first record (to be inserted into parent).
1550 */
1551STATIC int /* error */
1552xfs_bmbt_split(
1553 xfs_btree_cur_t *cur,
1554 int level,
1555 xfs_fsblock_t *bnop,
1556 xfs_bmbt_key_t *keyp,
1557 xfs_btree_cur_t **curp,
1558 int *stat) /* success/failure */
1559{
1560 xfs_alloc_arg_t args; /* block allocation args */
1561 int error; /* error return value */
1562#ifdef XFS_BMBT_TRACE
1563 static char fname[] = "xfs_bmbt_split";
1564#endif
1565 int i; /* loop counter */
1566 xfs_fsblock_t lbno; /* left sibling block number */
1567 xfs_buf_t *lbp; /* left buffer pointer */
1568 xfs_bmbt_block_t *left; /* left btree block */
1569 xfs_bmbt_key_t *lkp; /* left btree key */
1570 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1571 xfs_bmbt_rec_t *lrp; /* left record pointer */
1572 xfs_buf_t *rbp; /* right buffer pointer */
1573 xfs_bmbt_block_t *right; /* right btree block */
1574 xfs_bmbt_key_t *rkp; /* right btree key */
1575 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1576 xfs_bmbt_block_t *rrblock; /* right-right btree block */
1577 xfs_buf_t *rrbp; /* right-right buffer pointer */
1578 xfs_bmbt_rec_t *rrp; /* right record pointer */
1579
1580 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1581 XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp);
1582 args.tp = cur->bc_tp;
1583 args.mp = cur->bc_mp;
1584 lbp = cur->bc_bufs[level];
1585 lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
1586 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1587 args.fsbno = cur->bc_private.b.firstblock;
1588 if (args.fsbno == NULLFSBLOCK) {
1589 args.fsbno = lbno;
1590 args.type = XFS_ALLOCTYPE_START_BNO;
1591 } else if (cur->bc_private.b.flist->xbf_low)
1592 args.type = XFS_ALLOCTYPE_FIRST_AG;
1593 else
1594 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1595 args.mod = args.minleft = args.alignment = args.total = args.isfl =
1596 args.userdata = args.minalignslop = 0;
1597 args.minlen = args.maxlen = args.prod = 1;
1598 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
1599 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
1600 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1601 return XFS_ERROR(ENOSPC);
1602 }
1603 if ((error = xfs_alloc_vextent(&args))) {
1604 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1605 return error;
1606 }
1607 if (args.fsbno == NULLFSBLOCK) {
1608 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1609 *stat = 0;
1610 return 0;
1611 }
1612 ASSERT(args.len == 1);
1613 cur->bc_private.b.firstblock = args.fsbno;
1614 cur->bc_private.b.allocated++;
1615 cur->bc_private.b.ip->i_d.di_nblocks++;
1616 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
1617 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
1618 XFS_TRANS_DQ_BCOUNT, 1L);
1619 rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
1620 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1621#ifdef DEBUG
1622 if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
1623 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1624 return error;
1625 }
1626#endif
1627 INT_SET(right->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
1628 right->bb_level = left->bb_level; /* INT_: direct copy */
1629 INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
1630 if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
1631 cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
1632 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1633 i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
1634 if (level > 0) {
1635 lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
1636 lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
1637 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1638 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1639#ifdef DEBUG
1640 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1641 if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) {
1642 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1643 return error;
1644 }
1645 }
1646#endif
1647 memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1648 memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1649 xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1650 xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1651 keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT);
1652 } else {
1653 lrp = XFS_BMAP_REC_IADDR(left, i, cur);
1654 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1655 memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1656 xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1657 keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp);
1658 }
1659 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
1660 right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
1661 INT_SET(left->bb_rightsib, ARCH_CONVERT, args.fsbno);
1662 INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
1663 xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
1664 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1665 if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
1666 if ((error = xfs_btree_read_bufl(args.mp, args.tp,
1667 INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
1668 XFS_BMAP_BTREE_REF))) {
1669 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1670 return error;
1671 }
1672 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
1673 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
1674 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1675 return error;
1676 }
1677 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.fsbno);
1678 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
1679 }
1680 if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
1681 xfs_btree_setbuf(cur, level, rbp);
1682 cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
1683 }
1684 if (level + 1 < cur->bc_nlevels) {
1685 if ((error = xfs_btree_dup_cursor(cur, curp))) {
1686 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1687 return error;
1688 }
1689 (*curp)->bc_ptrs[level + 1]++;
1690 }
1691 *bnop = args.fsbno;
1692 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1693 *stat = 1;
1694 return 0;
1695}
1696
1697
1698/*
1699 * Update keys for the record.
1700 */
1701STATIC int
1702xfs_bmbt_updkey(
1703 xfs_btree_cur_t *cur,
1704 xfs_bmbt_key_t *keyp, /* on-disk format */
1705 int level)
1706{
1707 xfs_bmbt_block_t *block;
1708 xfs_buf_t *bp;
1709#ifdef DEBUG
1710 int error;
1711#endif
1712#ifdef XFS_BMBT_TRACE
1713 static char fname[] = "xfs_bmbt_updkey";
1714#endif
1715 xfs_bmbt_key_t *kp;
1716 int ptr;
1717
1718 ASSERT(level >= 1);
1719 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1720 XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
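	/*
	 * A key needs rewriting only at levels where this record is the
	 * leftmost entry (ptr == 1); the climb stops at the first level
	 * where it is not.
	 */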
1721 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1722 block = xfs_bmbt_get_block(cur, level, &bp);
1723#ifdef DEBUG
1724 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1725 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1726 return error;
1727 }
1728#endif
1729 ptr = cur->bc_ptrs[level];
1730 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
1731 *kp = *keyp;
1732 xfs_bmbt_log_keys(cur, bp, ptr, ptr);
1733 }
1734 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1735 return 0;
1736}
1737
1738/*
1739 * Convert on-disk form of btree root to in-memory form.
1740 */
1741void
1742xfs_bmdr_to_bmbt(
1743 xfs_bmdr_block_t *dblock,
1744 int dblocklen,
1745 xfs_bmbt_block_t *rblock,
1746 int rblocklen)
1747{
1748 int dmxr;
1749 xfs_bmbt_key_t *fkp;
1750 xfs_bmbt_ptr_t *fpp;
1751 xfs_bmbt_key_t *tkp;
1752 xfs_bmbt_ptr_t *tpp;
1753
1754 INT_SET(rblock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
1755 rblock->bb_level = dblock->bb_level; /* both in on-disk format */
1756 ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
1757 rblock->bb_numrecs = dblock->bb_numrecs;/* both in on-disk format */
1758 INT_SET(rblock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
1759 INT_SET(rblock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
1760 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
1761 fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
1762 tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
1763 fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
1764 tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
1765 dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
1766 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
1767 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
1768}
1769
1770/*
1771 * Decrement cursor by one record at the level.
1772 * For nonzero levels the leaf-ward information is untouched.
1773 */
1774int /* error */
1775xfs_bmbt_decrement(
1776 xfs_btree_cur_t *cur,
1777 int level,
1778 int *stat) /* success/failure */
1779{
1780 xfs_bmbt_block_t *block;
1781 xfs_buf_t *bp;
1782 int error; /* error return value */
1783#ifdef XFS_BMBT_TRACE
1784 static char fname[] = "xfs_bmbt_decrement";
1785#endif
1786 xfs_fsblock_t fsbno;
1787 int lev;
1788 xfs_mount_t *mp;
1789 xfs_trans_t *tp;
1790
1791 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1792 XFS_BMBT_TRACE_ARGI(cur, level);
1793 ASSERT(level < cur->bc_nlevels);
1794 if (level < cur->bc_nlevels - 1)
1795 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1796 if (--cur->bc_ptrs[level] > 0) {
1797 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1798 *stat = 1;
1799 return 0;
1800 }
1801 block = xfs_bmbt_get_block(cur, level, &bp);
1802#ifdef DEBUG
1803 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1804 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1805 return error;
1806 }
1807#endif
1808 if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
1809 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1810 *stat = 0;
1811 return 0;
1812 }
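	/*
	 * March up the tree until a level is found where the cursor can
	 * step left within its block; the loop further below then walks
	 * back down the rightmost path of that left-hand subtree.
	 */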
1813 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1814 if (--cur->bc_ptrs[lev] > 0)
1815 break;
1816 if (lev < cur->bc_nlevels - 1)
1817 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1818 }
1819 if (lev == cur->bc_nlevels) {
1820 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1821 *stat = 0;
1822 return 0;
1823 }
1824 tp = cur->bc_tp;
1825 mp = cur->bc_mp;
1826 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
1827 fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1828 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
1829 XFS_BMAP_BTREE_REF))) {
1830 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1831 return error;
1832 }
1833 lev--;
1834 xfs_btree_setbuf(cur, lev, bp);
1835 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1836 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
1837 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1838 return error;
1839 }
1840 cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
1841 }
1842 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1843 *stat = 1;
1844 return 0;
1845}
1846
1847/*
1848 * Delete the record pointed to by cur.
1849 */
1850int /* error */
1851xfs_bmbt_delete(
1852 xfs_btree_cur_t *cur,
1853 int *stat) /* success/failure */
1854{
1855 int error; /* error return value */
1856#ifdef XFS_BMBT_TRACE
1857 static char fname[] = "xfs_bmbt_delete";
1858#endif
1859 int i;
1860 int level;
1861
1862 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
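	/*
	 * xfs_bmbt_delrec returns i == 2 when it joined two blocks at
	 * this level, which leaves a stale pointer to be deleted from
	 * the next level up; keep iterating until no join happens.
	 */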
1863 for (level = 0, i = 2; i == 2; level++) {
1864 if ((error = xfs_bmbt_delrec(cur, level, &i))) {
1865 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1866 return error;
1867 }
1868 }
1869 if (i == 0) {
1870 for (level = 1; level < cur->bc_nlevels; level++) {
1871 if (cur->bc_ptrs[level] == 0) {
1872 if ((error = xfs_bmbt_decrement(cur, level,
1873 &i))) {
1874 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1875 return error;
1876 }
1877 break;
1878 }
1879 }
1880 }
1881 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1882 *stat = i;
1883 return 0;
1884}
1885
1886/*
1887 * Convert a compressed bmap extent record to an uncompressed form.
1888 * This code must be in sync with the routines xfs_bmbt_get_startoff,
1889 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
1890 */
1891
1892STATIC __inline__ void
1893__xfs_bmbt_get_all(
1894 __uint64_t l0,
1895 __uint64_t l1,
1896 xfs_bmbt_irec_t *s)
1897{
1898 int ext_flag;
1899 xfs_exntst_t st;
1900
1901 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
1902 s->br_startoff = ((xfs_fileoff_t)l0 &
1903 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
1904#if XFS_BIG_BLKNOS
1905 s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
1906 (((xfs_fsblock_t)l1) >> 21);
1907#else
1908#ifdef DEBUG
1909 {
1910 xfs_dfsbno_t b;
1911
1912 b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
1913 (((xfs_dfsbno_t)l1) >> 21);
1914 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
1915 s->br_startblock = (xfs_fsblock_t)b;
1916 }
1917#else /* !DEBUG */
1918 s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
1919#endif /* DEBUG */
1920#endif /* XFS_BIG_BLKNOS */
1921 s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
1922 /* This is xfs_extent_state() in-line */
1923 if (ext_flag) {
1924 ASSERT(s->br_blockcount != 0); /* saved for DMIG */
1925 st = XFS_EXT_UNWRITTEN;
1926 } else
1927 st = XFS_EXT_NORM;
1928 s->br_state = st;
1929}
1930
1931void
1932xfs_bmbt_get_all(
1933 xfs_bmbt_rec_t *r,
1934 xfs_bmbt_irec_t *s)
1935{
1936 __xfs_bmbt_get_all(r->l0, r->l1, s);
1937}
1938
1939/*
1940 * Get the block pointer for the given level of the cursor.
1941 * Fill in the buffer pointer, if applicable.
1942 */
1943xfs_bmbt_block_t *
1944xfs_bmbt_get_block(
1945 xfs_btree_cur_t *cur,
1946 int level,
1947 xfs_buf_t **bpp)
1948{
1949 xfs_ifork_t *ifp;
1950 xfs_bmbt_block_t *rval;
1951
1952 if (level < cur->bc_nlevels - 1) {
1953 *bpp = cur->bc_bufs[level];
1954 rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
1955 } else {
1956 *bpp = NULL;
1957 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
1958 cur->bc_private.b.whichfork);
1959 rval = ifp->if_broot;
1960 }
1961 return rval;
1962}
1963
1964/*
 1965 * Extract the blockcount field from an in-memory bmap extent record.
1966 */
1967xfs_filblks_t
1968xfs_bmbt_get_blockcount(
1969 xfs_bmbt_rec_t *r)
1970{
1971 return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21));
1972}
1973
1974/*
 1975 * Extract the startblock field from an in-memory bmap extent record.
1976 */
1977xfs_fsblock_t
1978xfs_bmbt_get_startblock(
1979 xfs_bmbt_rec_t *r)
1980{
1981#if XFS_BIG_BLKNOS
1982 return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) |
1983 (((xfs_fsblock_t)r->l1) >> 21);
1984#else
1985#ifdef DEBUG
1986 xfs_dfsbno_t b;
1987
1988 b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) |
1989 (((xfs_dfsbno_t)r->l1) >> 21);
1990 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
1991 return (xfs_fsblock_t)b;
1992#else /* !DEBUG */
1993 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
1994#endif /* DEBUG */
1995#endif /* XFS_BIG_BLKNOS */
1996}
1997
1998/*
 1999 * Extract the startoff field from an in-memory bmap extent record.
2000 */
2001xfs_fileoff_t
2002xfs_bmbt_get_startoff(
2003 xfs_bmbt_rec_t *r)
2004{
2005 return ((xfs_fileoff_t)r->l0 &
2006 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
2007}
2008
2009xfs_exntst_t
2010xfs_bmbt_get_state(
2011 xfs_bmbt_rec_t *r)
2012{
2013 int ext_flag;
2014
2015 ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
2016 return xfs_extent_state(xfs_bmbt_get_blockcount(r),
2017 ext_flag);
2018}
2019
2020#if __BYTE_ORDER != __BIG_ENDIAN
2021/* Endian flipping versions of the bmbt extraction functions */
2022void
2023xfs_bmbt_disk_get_all(
2024 xfs_bmbt_rec_t *r,
2025 xfs_bmbt_irec_t *s)
2026{
2027 __uint64_t l0, l1;
2028
2029 l0 = INT_GET(r->l0, ARCH_CONVERT);
2030 l1 = INT_GET(r->l1, ARCH_CONVERT);
2031
2032 __xfs_bmbt_get_all(l0, l1, s);
2033}
2034
2035/*
 2036 * Extract the blockcount field from an on-disk bmap extent record.
2037 */
2038xfs_filblks_t
2039xfs_bmbt_disk_get_blockcount(
2040 xfs_bmbt_rec_t *r)
2041{
2042 return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21));
2043}
2044
2045/*
 2046 * Extract the startblock field from an on-disk bmap extent record.
2047 */
2048xfs_fsblock_t
2049xfs_bmbt_disk_get_startblock(
2050 xfs_bmbt_rec_t *r)
2051{
2052#if XFS_BIG_BLKNOS
2053 return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
2054 (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
2055#else
2056#ifdef DEBUG
2057 xfs_dfsbno_t b;
2058
2059 b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
2060 (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
2061 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
2062 return (xfs_fsblock_t)b;
2063#else /* !DEBUG */
2064 return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
2065#endif /* DEBUG */
2066#endif /* XFS_BIG_BLKNOS */
2067}
2068
2069/*
2070 * Extract the startoff field from a disk format bmap extent record.
2071 */
2072xfs_fileoff_t
2073xfs_bmbt_disk_get_startoff(
2074 xfs_bmbt_rec_t *r)
2075{
2076 return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) &
2077 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
2078}
2079
2080xfs_exntst_t
2081xfs_bmbt_disk_get_state(
2082 xfs_bmbt_rec_t *r)
2083{
2084 int ext_flag;
2085
2086 ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN));
2087 return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r),
2088 ext_flag);
2089}
2090#endif
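/*
 * On big-endian hosts the on-disk and in-core record layouts are
 * identical, so xfs_bmap_btree.h simply #defines the xfs_bmbt_disk_get_*
 * names to the in-core accessors above and this section is not built.
 */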
2091
2092
2093/*
2094 * Increment cursor by one record at the level.
2095 * For nonzero levels the leaf-ward information is untouched.
2096 */
2097int /* error */
2098xfs_bmbt_increment(
2099 xfs_btree_cur_t *cur,
2100 int level,
2101 int *stat) /* success/failure */
2102{
2103 xfs_bmbt_block_t *block;
2104 xfs_buf_t *bp;
2105 int error; /* error return value */
2106#ifdef XFS_BMBT_TRACE
2107 static char fname[] = "xfs_bmbt_increment";
2108#endif
2109 xfs_fsblock_t fsbno;
2110 int lev;
2111 xfs_mount_t *mp;
2112 xfs_trans_t *tp;
2113
2114 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2115 XFS_BMBT_TRACE_ARGI(cur, level);
2116 ASSERT(level < cur->bc_nlevels);
2117 if (level < cur->bc_nlevels - 1)
2118 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
2119 block = xfs_bmbt_get_block(cur, level, &bp);
2120#ifdef DEBUG
2121 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
2122 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2123 return error;
2124 }
2125#endif
2126 if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
2127 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2128 *stat = 1;
2129 return 0;
2130 }
2131 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
2132 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2133 *stat = 0;
2134 return 0;
2135 }
2136 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
2137 block = xfs_bmbt_get_block(cur, lev, &bp);
2138#ifdef DEBUG
2139 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2140 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2141 return error;
2142 }
2143#endif
2144 if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
2145 break;
2146 if (lev < cur->bc_nlevels - 1)
2147 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2148 }
2149 if (lev == cur->bc_nlevels) {
2150 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2151 *stat = 0;
2152 return 0;
2153 }
2154 tp = cur->bc_tp;
2155 mp = cur->bc_mp;
2156 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
2157 fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
2158 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
2159 XFS_BMAP_BTREE_REF))) {
2160 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2161 return error;
2162 }
2163 lev--;
2164 xfs_btree_setbuf(cur, lev, bp);
2165 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2166 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2167 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2168 return error;
2169 }
2170 cur->bc_ptrs[lev] = 1;
2171 }
2172 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2173 *stat = 1;
2174 return 0;
2175}
2176
2177/*
2178 * Insert the current record at the point referenced by cur.
2179 */
2180int /* error */
2181xfs_bmbt_insert(
2182 xfs_btree_cur_t *cur,
2183 int *stat) /* success/failure */
2184{
2185 int error; /* error return value */
2186#ifdef XFS_BMBT_TRACE
2187 static char fname[] = "xfs_bmbt_insert";
2188#endif
2189 int i;
2190 int level;
2191 xfs_fsblock_t nbno;
2192 xfs_btree_cur_t *ncur;
2193 xfs_bmbt_rec_t nrec;
2194 xfs_btree_cur_t *pcur;
2195
2196 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2197 level = 0;
2198 nbno = NULLFSBLOCK;
2199 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2200 ncur = (xfs_btree_cur_t *)0;
2201 pcur = cur;
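	/*
	 * Work up the tree: each xfs_bmbt_insrec call may split its
	 * level, handing back the new sibling's block number in nbno,
	 * the record to insert one level up in nrec, and possibly a new
	 * cursor (ncur) to continue with; the loop ends once a level
	 * absorbs the insert without splitting (nbno == NULLFSBLOCK).
	 */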
2202 do {
2203 if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
2204 &i))) {
2205 if (pcur != cur)
2206 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2207 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2208 return error;
2209 }
2210 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2211 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
2212 cur->bc_nlevels = pcur->bc_nlevels;
2213 cur->bc_private.b.allocated +=
2214 pcur->bc_private.b.allocated;
2215 pcur->bc_private.b.allocated = 0;
2216 ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
2217 (cur->bc_private.b.ip->i_d.di_flags &
2218 XFS_DIFLAG_REALTIME));
2219 cur->bc_private.b.firstblock =
2220 pcur->bc_private.b.firstblock;
2221 ASSERT(cur->bc_private.b.flist ==
2222 pcur->bc_private.b.flist);
2223 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2224 }
2225 if (ncur) {
2226 pcur = ncur;
2227 ncur = (xfs_btree_cur_t *)0;
2228 }
2229 } while (nbno != NULLFSBLOCK);
2230 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2231 *stat = i;
2232 return 0;
2233error0:
2234 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2235 return error;
2236}
2237
2238/*
2239 * Log fields from the btree block header.
2240 */
2241void
2242xfs_bmbt_log_block(
2243 xfs_btree_cur_t *cur,
2244 xfs_buf_t *bp,
2245 int fields)
2246{
2247 int first;
2248#ifdef XFS_BMBT_TRACE
2249 static char fname[] = "xfs_bmbt_log_block";
2250#endif
2251 int last;
2252 xfs_trans_t *tp;
2253 static const short offsets[] = {
2254 offsetof(xfs_bmbt_block_t, bb_magic),
2255 offsetof(xfs_bmbt_block_t, bb_level),
2256 offsetof(xfs_bmbt_block_t, bb_numrecs),
2257 offsetof(xfs_bmbt_block_t, bb_leftsib),
2258 offsetof(xfs_bmbt_block_t, bb_rightsib),
2259 sizeof(xfs_bmbt_block_t)
2260 };
2261
2262 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2263 XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
2264 tp = cur->bc_tp;
2265 if (bp) {
2266 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
2267 &last);
2268 xfs_trans_log_buf(tp, bp, first, last);
2269 } else
2270 xfs_trans_log_inode(tp, cur->bc_private.b.ip,
2271 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
2272 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2273}
2274
2275/*
2276 * Log record values from the btree block.
2277 */
2278void
2279xfs_bmbt_log_recs(
2280 xfs_btree_cur_t *cur,
2281 xfs_buf_t *bp,
2282 int rfirst,
2283 int rlast)
2284{
2285 xfs_bmbt_block_t *block;
2286 int first;
2287#ifdef XFS_BMBT_TRACE
2288 static char fname[] = "xfs_bmbt_log_recs";
2289#endif
2290 int last;
2291 xfs_bmbt_rec_t *rp;
2292 xfs_trans_t *tp;
2293
2294 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2295 XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
2296 ASSERT(bp);
2297 tp = cur->bc_tp;
2298 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2299 rp = XFS_BMAP_REC_DADDR(block, 1, cur);
2300 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
2301 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
2302 xfs_trans_log_buf(tp, bp, first, last);
2303 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2304}
2305
2306int /* error */
2307xfs_bmbt_lookup_eq(
2308 xfs_btree_cur_t *cur,
2309 xfs_fileoff_t off,
2310 xfs_fsblock_t bno,
2311 xfs_filblks_t len,
2312 int *stat) /* success/failure */
2313{
2314 cur->bc_rec.b.br_startoff = off;
2315 cur->bc_rec.b.br_startblock = bno;
2316 cur->bc_rec.b.br_blockcount = len;
2317 return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
2318}
2319
2320int /* error */
2321xfs_bmbt_lookup_ge(
2322 xfs_btree_cur_t *cur,
2323 xfs_fileoff_t off,
2324 xfs_fsblock_t bno,
2325 xfs_filblks_t len,
2326 int *stat) /* success/failure */
2327{
2328 cur->bc_rec.b.br_startoff = off;
2329 cur->bc_rec.b.br_startblock = bno;
2330 cur->bc_rec.b.br_blockcount = len;
2331 return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
2332}
2333
2334int /* error */
2335xfs_bmbt_lookup_le(
2336 xfs_btree_cur_t *cur,
2337 xfs_fileoff_t off,
2338 xfs_fsblock_t bno,
2339 xfs_filblks_t len,
2340 int *stat) /* success/failure */
2341{
2342 cur->bc_rec.b.br_startoff = off;
2343 cur->bc_rec.b.br_startblock = bno;
2344 cur->bc_rec.b.br_blockcount = len;
2345 return xfs_bmbt_lookup(cur, XFS_LOOKUP_LE, stat);
2346}
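
A minimal caller sketch (hypothetical, not part of this patch; the helper
name and error handling are illustrative) of the usual pattern built on
these wrappers: position the cursor on an exact record, then rewrite it.

	STATIC int				/* error */
	xfs_example_rewrite_extent(
		xfs_btree_cur_t	*cur,		/* btree cursor */
		xfs_bmbt_irec_t	*orec,		/* record to find */
		xfs_bmbt_irec_t	*nrec)		/* values to store */
	{
		int		error;		/* error return value */
		int		i;		/* 1 == exact match found */

		if ((error = xfs_bmbt_lookup_eq(cur, orec->br_startoff,
				orec->br_startblock, orec->br_blockcount, &i)))
			return error;
		if (i != 1)
			return XFS_ERROR(EFSCORRUPTED);
		return xfs_bmbt_update(cur, nrec->br_startoff,
				nrec->br_startblock, nrec->br_blockcount,
				nrec->br_state);
	}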
2347
2348/*
2349 * Give the bmap btree a new root block. Copy the old broot contents
2350 * down into a real block and make the broot point to it.
2351 */
2352int /* error */
2353xfs_bmbt_newroot(
2354 xfs_btree_cur_t *cur, /* btree cursor */
2355 int *logflags, /* logging flags for inode */
2356 int *stat) /* return status - 0 fail */
2357{
2358 xfs_alloc_arg_t args; /* allocation arguments */
2359 xfs_bmbt_block_t *block; /* bmap btree block */
2360 xfs_buf_t *bp; /* buffer for block */
2361 xfs_bmbt_block_t *cblock; /* child btree block */
2362 xfs_bmbt_key_t *ckp; /* child key pointer */
2363 xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
2364 int error; /* error return code */
2365#ifdef XFS_BMBT_TRACE
2366 static char fname[] = "xfs_bmbt_newroot";
2367#endif
2368#ifdef DEBUG
2369 int i; /* loop counter */
2370#endif
2371 xfs_bmbt_key_t *kp; /* pointer to bmap btree key */
2372 int level; /* btree level */
2373 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
2374
2375 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2376 level = cur->bc_nlevels - 1;
2377 block = xfs_bmbt_get_block(cur, level, &bp);
2378 /*
2379 * Copy the root into a real block.
2380 */
2381 args.mp = cur->bc_mp;
2382 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
2383 args.tp = cur->bc_tp;
2384 args.fsbno = cur->bc_private.b.firstblock;
2385 args.mod = args.minleft = args.alignment = args.total = args.isfl =
2386 args.userdata = args.minalignslop = 0;
2387 args.minlen = args.maxlen = args.prod = 1;
2388 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
2389 if (args.fsbno == NULLFSBLOCK) {
2390#ifdef DEBUG
2391 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
2392 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2393 return error;
2394 }
2395#endif
2396 args.fsbno = INT_GET(*pp, ARCH_CONVERT);
2397 args.type = XFS_ALLOCTYPE_START_BNO;
2398 } else if (args.wasdel)
2399 args.type = XFS_ALLOCTYPE_FIRST_AG;
2400 else
2401 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2402 if ((error = xfs_alloc_vextent(&args))) {
2403 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2404 return error;
2405 }
2406 if (args.fsbno == NULLFSBLOCK) {
2407 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2408 *stat = 0;
2409 return 0;
2410 }
2411 ASSERT(args.len == 1);
2412 cur->bc_private.b.firstblock = args.fsbno;
2413 cur->bc_private.b.allocated++;
2414 cur->bc_private.b.ip->i_d.di_nblocks++;
2415 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
2416 XFS_TRANS_DQ_BCOUNT, 1L);
2417 bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
2418 cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
2419 *cblock = *block;
2420 INT_MOD(block->bb_level, ARCH_CONVERT, +1);
2421 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
2422 cur->bc_nlevels++;
2423 cur->bc_ptrs[level + 1] = 1;
2424 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
2425 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
2426 memcpy(ckp, kp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
2427 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
2428#ifdef DEBUG
2429 for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
2430 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
2431 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2432 return error;
2433 }
2434 }
2435#endif
2436 memcpy(cpp, pp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
2437#ifdef DEBUG
2438 if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno,
2439 level))) {
2440 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2441 return error;
2442 }
2443#endif
2444 INT_SET(*pp, ARCH_CONVERT, args.fsbno);
2445 xfs_iroot_realloc(cur->bc_private.b.ip, 1 - INT_GET(cblock->bb_numrecs, ARCH_CONVERT),
2446 cur->bc_private.b.whichfork);
2447 xfs_btree_setbuf(cur, level, bp);
2448 /*
2449 * Do all this logging at the end so that
2450 * the root is at the right level.
2451 */
2452 xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
2453 xfs_bmbt_log_keys(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
2454 xfs_bmbt_log_ptrs(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
2455 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2456 *logflags |=
2457 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2458 *stat = 1;
2459 return 0;
2460}
2461
2462/*
2463 * Set all the fields in a bmap extent record from the uncompressed form.
2464 */
2465void
2466xfs_bmbt_set_all(
2467 xfs_bmbt_rec_t *r,
2468 xfs_bmbt_irec_t *s)
2469{
2470 int extent_flag;
2471
2472 ASSERT((s->br_state == XFS_EXT_NORM) ||
2473 (s->br_state == XFS_EXT_UNWRITTEN));
2474 extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
2475 ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
2476 ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
2477#if XFS_BIG_BLKNOS
2478 ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
2479 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2480 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2481 ((xfs_bmbt_rec_base_t)s->br_startblock >> 43);
2482 r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2483 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2484 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2485#else /* !XFS_BIG_BLKNOS */
2486 if (ISNULLSTARTBLOCK(s->br_startblock)) {
2487 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2488 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2489 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2490 r->l1 = XFS_MASK64HI(11) |
2491 ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2492 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2493 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2494 } else {
2495 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2496 ((xfs_bmbt_rec_base_t)s->br_startoff << 9);
2497 r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2498 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2499 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2500 }
2501#endif /* XFS_BIG_BLKNOS */
2502}
2503
2504/*
2505 * Set all the fields in a bmap extent record from the arguments.
2506 */
2507void
2508xfs_bmbt_set_allf(
2509 xfs_bmbt_rec_t *r,
2510 xfs_fileoff_t o,
2511 xfs_fsblock_t b,
2512 xfs_filblks_t c,
2513 xfs_exntst_t v)
2514{
2515 int extent_flag;
2516
2517 ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
2518 extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
2519 ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
2520 ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
2521#if XFS_BIG_BLKNOS
2522 ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
2523 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2524 ((xfs_bmbt_rec_base_t)o << 9) |
2525 ((xfs_bmbt_rec_base_t)b >> 43);
2526 r->l1 = ((xfs_bmbt_rec_base_t)b << 21) |
2527 ((xfs_bmbt_rec_base_t)c &
2528 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2529#else /* !XFS_BIG_BLKNOS */
2530 if (ISNULLSTARTBLOCK(b)) {
2531 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2532 ((xfs_bmbt_rec_base_t)o << 9) |
2533 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2534 r->l1 = XFS_MASK64HI(11) |
2535 ((xfs_bmbt_rec_base_t)b << 21) |
2536 ((xfs_bmbt_rec_base_t)c &
2537 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2538 } else {
2539 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2540 ((xfs_bmbt_rec_base_t)o << 9);
2541 r->l1 = ((xfs_bmbt_rec_base_t)b << 21) |
2542 ((xfs_bmbt_rec_base_t)c &
2543 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2544 }
2545#endif /* XFS_BIG_BLKNOS */
2546}
2547
2548#if __BYTE_ORDER != __BIG_ENDIAN
2549/*
2550 * Set all the fields in a bmap extent record from the uncompressed form.
2551 */
2552void
2553xfs_bmbt_disk_set_all(
2554 xfs_bmbt_rec_t *r,
2555 xfs_bmbt_irec_t *s)
2556{
2557 int extent_flag;
2558
2559 ASSERT((s->br_state == XFS_EXT_NORM) ||
2560 (s->br_state == XFS_EXT_UNWRITTEN));
2561 extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
2562 ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
2563 ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
2564#if XFS_BIG_BLKNOS
2565 ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
2566 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2567 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2568 ((xfs_bmbt_rec_base_t)s->br_startblock >> 43));
2569 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2570 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2571 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2572#else /* !XFS_BIG_BLKNOS */
2573 if (ISNULLSTARTBLOCK(s->br_startblock)) {
2574 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2575 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2576 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
2577 INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
2578 ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2579 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2580 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2581 } else {
2582 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2583 ((xfs_bmbt_rec_base_t)s->br_startoff << 9));
2584 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2585 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2586 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2587 }
2588#endif /* XFS_BIG_BLKNOS */
2589}
2590
2591/*
2592 * Set all the fields in a disk format bmap extent record from the arguments.
2593 */
2594void
2595xfs_bmbt_disk_set_allf(
2596 xfs_bmbt_rec_t *r,
2597 xfs_fileoff_t o,
2598 xfs_fsblock_t b,
2599 xfs_filblks_t c,
2600 xfs_exntst_t v)
2601{
2602 int extent_flag;
2603
2604 ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
2605 extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
2606 ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
2607 ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
2608#if XFS_BIG_BLKNOS
2609 ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
2610 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2611 ((xfs_bmbt_rec_base_t)o << 9) |
2612 ((xfs_bmbt_rec_base_t)b >> 43));
2613 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
2614 ((xfs_bmbt_rec_base_t)c &
2615 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2616#else /* !XFS_BIG_BLKNOS */
2617 if (ISNULLSTARTBLOCK(b)) {
2618 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2619 ((xfs_bmbt_rec_base_t)o << 9) |
2620 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
2621 INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
2622 ((xfs_bmbt_rec_base_t)b << 21) |
2623 ((xfs_bmbt_rec_base_t)c &
2624 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2625 } else {
2626 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2627 ((xfs_bmbt_rec_base_t)o << 9));
2628 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
2629 ((xfs_bmbt_rec_base_t)c &
2630 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2631 }
2632#endif /* XFS_BIG_BLKNOS */
2633}
2634#endif
2635
2636/*
2637 * Set the blockcount field in a bmap extent record.
2638 */
2639void
2640xfs_bmbt_set_blockcount(
2641 xfs_bmbt_rec_t *r,
2642 xfs_filblks_t v)
2643{
2644 ASSERT((v & XFS_MASK64HI(43)) == 0);
2645 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
2646 (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21));
2647}
2648
2649/*
2650 * Set the startblock field in a bmap extent record.
2651 */
2652void
2653xfs_bmbt_set_startblock(
2654 xfs_bmbt_rec_t *r,
2655 xfs_fsblock_t v)
2656{
2657#if XFS_BIG_BLKNOS
2658 ASSERT((v & XFS_MASK64HI(12)) == 0);
2659 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
2660 (xfs_bmbt_rec_base_t)(v >> 43);
2661 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
2662 (xfs_bmbt_rec_base_t)(v << 21);
2663#else /* !XFS_BIG_BLKNOS */
2664 if (ISNULLSTARTBLOCK(v)) {
2665 r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2666 r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
2667 ((xfs_bmbt_rec_base_t)v << 21) |
2668 (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2669 } else {
2670 r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2671 r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
2672 (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2673 }
2674#endif /* XFS_BIG_BLKNOS */
2675}
2676
2677/*
2678 * Set the startoff field in a bmap extent record.
2679 */
2680void
2681xfs_bmbt_set_startoff(
2682 xfs_bmbt_rec_t *r,
2683 xfs_fileoff_t v)
2684{
2685 ASSERT((v & XFS_MASK64HI(9)) == 0);
2686 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
2687 ((xfs_bmbt_rec_base_t)v << 9) |
2688 (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
2689}
2690
2691/*
2692 * Set the extent state field in a bmap extent record.
2693 */
2694void
2695xfs_bmbt_set_state(
2696 xfs_bmbt_rec_t *r,
2697 xfs_exntst_t v)
2698{
2699 ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
2700 if (v == XFS_EXT_NORM)
2701 r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN);
2702 else
2703 r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN);
2704}
2705
2706/*
2707 * Convert in-memory form of btree root to on-disk form.
2708 */
2709void
2710xfs_bmbt_to_bmdr(
2711 xfs_bmbt_block_t *rblock,
2712 int rblocklen,
2713 xfs_bmdr_block_t *dblock,
2714 int dblocklen)
2715{
2716 int dmxr;
2717 xfs_bmbt_key_t *fkp;
2718 xfs_bmbt_ptr_t *fpp;
2719 xfs_bmbt_key_t *tkp;
2720 xfs_bmbt_ptr_t *tpp;
2721
2722 ASSERT(INT_GET(rblock->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC);
2723 ASSERT(INT_GET(rblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
2724 ASSERT(INT_GET(rblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
2725 ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
2726 dblock->bb_level = rblock->bb_level; /* both in on-disk format */
2727 dblock->bb_numrecs = rblock->bb_numrecs;/* both in on-disk format */
2728 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
2729 fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
2730 tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
2731 fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
2732 tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
2733 dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
2734 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
2735 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
2736}
2737
2738/*
2739 * Update the record to the passed values.
2740 */
2741int
2742xfs_bmbt_update(
2743 xfs_btree_cur_t *cur,
2744 xfs_fileoff_t off,
2745 xfs_fsblock_t bno,
2746 xfs_filblks_t len,
2747 xfs_exntst_t state)
2748{
2749 xfs_bmbt_block_t *block;
2750 xfs_buf_t *bp;
2751 int error;
2752#ifdef XFS_BMBT_TRACE
2753 static char fname[] = "xfs_bmbt_update";
2754#endif
2755 xfs_bmbt_key_t key;
2756 int ptr;
2757 xfs_bmbt_rec_t *rp;
2758
2759 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2760 XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
2761 (xfs_dfilblks_t)len, (int)state);
2762 block = xfs_bmbt_get_block(cur, 0, &bp);
2763#ifdef DEBUG
2764 if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
2765 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2766 return error;
2767 }
2768#endif
2769 ptr = cur->bc_ptrs[0];
2770 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
2771 xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
2772 xfs_bmbt_log_recs(cur, bp, ptr, ptr);
2773 if (ptr > 1) {
2774 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2775 return 0;
2776 }
2777 INT_SET(key.br_startoff, ARCH_CONVERT, off);
2778 if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
2779 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2780 return error;
2781 }
2782 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2783 return 0;
2784}
2785
2786/*
2787 * Check an extent list, which has just been read, for
2788 * any bit in the extent flag field. ASSERT on debug
2789 * kernels, as this condition should not occur.
 2790 * Return an error condition (1) if any flags are found,
2791 * otherwise return 0.
2792 */
2793
2794int
2795xfs_check_nostate_extents(
2796 xfs_bmbt_rec_t *ep,
2797 xfs_extnum_t num)
2798{
2799 for (; num > 0; num--, ep++) {
2800 if ((ep->l0 >>
2801 (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
2802 ASSERT(0);
2803 return 1;
2804 }
2805 }
2806 return 0;
2807}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
new file mode 100644
index 000000000000..843ff12b4bf2
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -0,0 +1,701 @@
1/*
2 * Copyright (c) 2000,2002-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BMAP_BTREE_H__
33#define __XFS_BMAP_BTREE_H__
34
35#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
36
37struct xfs_btree_cur;
38struct xfs_btree_lblock;
39struct xfs_mount;
40struct xfs_inode;
41
42/*
43 * Bmap root header, on-disk form only.
44 */
45typedef struct xfs_bmdr_block
46{
47 __uint16_t bb_level; /* 0 is a leaf */
48 __uint16_t bb_numrecs; /* current # of data records */
49} xfs_bmdr_block_t;
50
51/*
52 * Bmap btree record and extent descriptor.
53 * For 32-bit kernels,
54 * l0:31 is an extent flag (value 1 indicates non-normal).
55 * l0:0-30 and l1:9-31 are startoff.
56 * l1:0-8, l2:0-31, and l3:21-31 are startblock.
57 * l3:0-20 are blockcount.
58 * For 64-bit kernels,
59 * l0:63 is an extent flag (value 1 indicates non-normal).
60 * l0:9-62 are startoff.
61 * l0:0-8 and l1:21-63 are startblock.
62 * l1:0-20 are blockcount.
63 */
64
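
To make the 64-bit layout above concrete, here is a stand-alone sketch of
the packing arithmetic (illustrative only: it uses plain stdint types and
a local mask helper rather than the kernel's xfs_* typedefs and
XFS_MASK64LO):

	#include <stdint.h>

	struct ext {				/* mirrors xfs_bmbt_irec_t */
		uint64_t	startoff;	/* 54 significant bits */
		uint64_t	startblock;	/* 52 significant bits */
		uint64_t	blockcount;	/* 21 significant bits */
		int		unwritten;	/* extent flag */
	};

	#define LOMASK(n)	(((uint64_t)1 << (n)) - 1)

	static void pack(const struct ext *e, uint64_t *l0, uint64_t *l1)
	{
		*l0 = ((uint64_t)e->unwritten << 63) |		/* l0:63 */
		      ((e->startoff & LOMASK(54)) << 9) |	/* l0:9-62 */
		      (e->startblock >> 43);			/* l0:0-8 */
		*l1 = ((e->startblock & LOMASK(43)) << 21) |	/* l1:21-63 */
		      (e->blockcount & LOMASK(21));		/* l1:0-20 */
	}

	static void unpack(uint64_t l0, uint64_t l1, struct ext *e)
	{
		e->unwritten  = (int)(l0 >> 63);
		e->startoff   = (l0 & LOMASK(63)) >> 9;
		e->startblock = ((l0 & LOMASK(9)) << 43) | (l1 >> 21);
		e->blockcount = l1 & LOMASK(21);
	}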
65#if __BYTE_ORDER == __LITTLE_ENDIAN
66
67#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */
68#define BMBT_EXNTFLAG_BITOFF 0
69#define BMBT_EXNTFLAG_BITLEN 1
70#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN)
71#define BMBT_STARTOFF_BITLEN 54
72#define BMBT_STARTBLOCK_BITOFF (BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN)
73#define BMBT_STARTBLOCK_BITLEN 52
74#define BMBT_BLOCKCOUNT_BITOFF \
75 (BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN)
76#define BMBT_BLOCKCOUNT_BITLEN (BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF)
77
78#else
79
80#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */
81#define BMBT_EXNTFLAG_BITOFF 63
82#define BMBT_EXNTFLAG_BITLEN 1
83#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN)
84#define BMBT_STARTOFF_BITLEN 54
85#define BMBT_STARTBLOCK_BITOFF 85 /* 128 - 43 (other 9 is in first word) */
86#define BMBT_STARTBLOCK_BITLEN 52
87#define BMBT_BLOCKCOUNT_BITOFF 64 /* Start of second 64 bit container */
88#define BMBT_BLOCKCOUNT_BITLEN 21
89
90#endif
91
92
93#define BMBT_USE_64 1
94
95typedef struct xfs_bmbt_rec_32
96{
97 __uint32_t l0, l1, l2, l3;
98} xfs_bmbt_rec_32_t;
99typedef struct xfs_bmbt_rec_64
100{
101 __uint64_t l0, l1;
102} xfs_bmbt_rec_64_t;
103
104typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
105typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t;
106
107/*
108 * Values and macros for delayed-allocation startblock fields.
109 */
110#define STARTBLOCKVALBITS 17
111#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
112#define DSTARTBLOCKMASKBITS (15 + 20)
113#define STARTBLOCKMASK \
114 (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
115#define DSTARTBLOCKMASK \
116 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
117#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLSTARTBLOCK)
118int isnullstartblock(xfs_fsblock_t x);
119#define ISNULLSTARTBLOCK(x) isnullstartblock(x)
120#else
121#define ISNULLSTARTBLOCK(x) (((x) & STARTBLOCKMASK) == STARTBLOCKMASK)
122#endif
123#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLDSTARTBLOCK)
124int isnulldstartblock(xfs_dfsbno_t x);
125#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x)
126#else
127#define ISNULLDSTARTBLOCK(x) (((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK)
128#endif
129#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_NULLSTARTBLOCK)
130xfs_fsblock_t nullstartblock(int k);
131#define NULLSTARTBLOCK(k) nullstartblock(k)
132#else
133#define NULLSTARTBLOCK(k) \
134 ((ASSERT(k < (1 << STARTBLOCKVALBITS))), (STARTBLOCKMASK | (k)))
135#endif
136#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_STARTBLOCKVAL)
137xfs_filblks_t startblockval(xfs_fsblock_t x);
138#define STARTBLOCKVAL(x) startblockval(x)
139#else
140#define STARTBLOCKVAL(x) ((xfs_filblks_t)((x) & ~STARTBLOCKMASK))
141#endif
142
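In words: a delayed-allocation extent has no real disk block yet, so the
startblock field is overloaded. Setting every mask bit marks the encoding,
and the low STARTBLOCKVALBITS bits carry a small per-extent value that
STARTBLOCKVAL() recovers. A stand-alone sketch of the same arithmetic,
assuming the XFS_BIG_BLKNOS case (35 mask bits):

	#include <assert.h>
	#include <stdint.h>

	#define VALBITS		17	/* STARTBLOCKVALBITS */
	#define MASKBITS	35	/* 15 + 20; 35 + 17 == 52-bit field */
	#define SBMASK		((((uint64_t)1 << MASKBITS) - 1) << VALBITS)

	static uint64_t nullstartblock(int k)	/* NULLSTARTBLOCK(k) */
	{
		assert(k < (1 << VALBITS));
		return SBMASK | (uint64_t)k;
	}

	static int isnullstartblock(uint64_t x)	/* ISNULLSTARTBLOCK(x) */
	{
		return (x & SBMASK) == SBMASK;
	}

	static uint64_t startblockval(uint64_t x) /* STARTBLOCKVAL(x) */
	{
		return x & ~SBMASK;
	}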
143/*
144 * Possible extent formats.
145 */
146typedef enum {
147 XFS_EXTFMT_NOSTATE = 0,
148 XFS_EXTFMT_HASSTATE
149} xfs_exntfmt_t;
150
151/*
152 * Possible extent states.
153 */
154typedef enum {
155 XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
156 XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
157} xfs_exntst_t;
158
159/*
160 * Extent state and extent format macros.
161 */
162#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTFMT_INODE )
163xfs_exntfmt_t xfs_extfmt_inode(struct xfs_inode *ip);
164#define XFS_EXTFMT_INODE(x) xfs_extfmt_inode(x)
165#else
166#define XFS_EXTFMT_INODE(x) \
167 (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \
168 XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
169#endif
170#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
171
172/*
173 * Incore version of above.
174 */
175typedef struct xfs_bmbt_irec
176{
177 xfs_fileoff_t br_startoff; /* starting file offset */
178 xfs_fsblock_t br_startblock; /* starting block number */
179 xfs_filblks_t br_blockcount; /* number of blocks */
180 xfs_exntst_t br_state; /* extent state */
181} xfs_bmbt_irec_t;
182
183/*
184 * Key structure for non-leaf levels of the tree.
185 */
186typedef struct xfs_bmbt_key
187{
188 xfs_dfiloff_t br_startoff; /* starting file offset */
189} xfs_bmbt_key_t, xfs_bmdr_key_t;
190
191typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */
192 /* btree block header type */
193typedef struct xfs_btree_lblock xfs_bmbt_block_t;
194
195#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
196xfs_bmbt_block_t *xfs_buf_to_bmbt_block(struct xfs_buf *bp);
197#define XFS_BUF_TO_BMBT_BLOCK(bp) xfs_buf_to_bmbt_block(bp)
198#else
199#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)(XFS_BUF_PTR(bp)))
200#endif
201
202#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
203int xfs_bmap_rblock_dsize(int lev, struct xfs_btree_cur *cur);
204#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) xfs_bmap_rblock_dsize(lev,cur)
205#else
206#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
207#endif
208#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
209int xfs_bmap_rblock_isize(int lev, struct xfs_btree_cur *cur);
210#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) xfs_bmap_rblock_isize(lev,cur)
211#else
212#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
213 ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
214 (cur)->bc_private.b.whichfork)->if_broot_bytes)
215#endif
216#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_IBLOCK_SIZE)
217int xfs_bmap_iblock_size(int lev, struct xfs_btree_cur *cur);
218#define XFS_BMAP_IBLOCK_SIZE(lev,cur) xfs_bmap_iblock_size(lev,cur)
219#else
220#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
221#endif
222
223#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DSIZE)
224int xfs_bmap_block_dsize(int lev, struct xfs_btree_cur *cur);
225#define XFS_BMAP_BLOCK_DSIZE(lev,cur) xfs_bmap_block_dsize(lev,cur)
226#else
227#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \
228 ((lev) == (cur)->bc_nlevels - 1 ? \
229 XFS_BMAP_RBLOCK_DSIZE(lev,cur) : \
230 XFS_BMAP_IBLOCK_SIZE(lev,cur))
231#endif
232#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_ISIZE)
233int xfs_bmap_block_isize(int lev, struct xfs_btree_cur *cur);
234#define XFS_BMAP_BLOCK_ISIZE(lev,cur) xfs_bmap_block_isize(lev,cur)
235#else
236#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \
237 ((lev) == (cur)->bc_nlevels - 1 ? \
238 XFS_BMAP_RBLOCK_ISIZE(lev,cur) : \
239 XFS_BMAP_IBLOCK_SIZE(lev,cur))
240#endif
241
242#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
243int xfs_bmap_block_dmaxrecs(int lev, struct xfs_btree_cur *cur);
244#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) xfs_bmap_block_dmaxrecs(lev,cur)
245#else
246#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
247 ((lev) == (cur)->bc_nlevels - 1 ? \
248 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
249 xfs_bmdr, (lev) == 0) : \
250 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
251#endif
252#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
253int xfs_bmap_block_imaxrecs(int lev, struct xfs_btree_cur *cur);
254#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) xfs_bmap_block_imaxrecs(lev,cur)
255#else
256#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
257 ((lev) == (cur)->bc_nlevels - 1 ? \
258 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
259 xfs_bmbt, (lev) == 0) : \
260 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
261#endif
262
263#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
264int xfs_bmap_block_dminrecs(int lev, struct xfs_btree_cur *cur);
265#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) xfs_bmap_block_dminrecs(lev,cur)
266#else
267#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
268 ((lev) == (cur)->bc_nlevels - 1 ? \
269 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
270 xfs_bmdr, (lev) == 0) : \
271 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
272#endif
273#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
274int xfs_bmap_block_iminrecs(int lev, struct xfs_btree_cur *cur);
275#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) xfs_bmap_block_iminrecs(lev,cur)
276#else
277#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
278 ((lev) == (cur)->bc_nlevels - 1 ? \
279 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
280 xfs_bmbt, (lev) == 0) : \
281 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
282#endif
283
284#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_DADDR)
285xfs_bmbt_rec_t *
286xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
287#define XFS_BMAP_REC_DADDR(bb,i,cur) xfs_bmap_rec_daddr(bb,i,cur)
288#else
289#define XFS_BMAP_REC_DADDR(bb,i,cur) \
290 XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \
291 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
292 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
293 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
294#endif
295#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_IADDR)
296xfs_bmbt_rec_t *
297xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
298#define XFS_BMAP_REC_IADDR(bb,i,cur) xfs_bmap_rec_iaddr(bb,i,cur)
299#else
300#define XFS_BMAP_REC_IADDR(bb,i,cur) \
301 XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \
302 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
303 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
304 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
305#endif
306
307#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_DADDR)
308xfs_bmbt_key_t *
309xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
310#define XFS_BMAP_KEY_DADDR(bb,i,cur) xfs_bmap_key_daddr(bb,i,cur)
311#else
312#define XFS_BMAP_KEY_DADDR(bb,i,cur) \
313 XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \
314 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
315 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
316 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
317#endif
318#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_IADDR)
319xfs_bmbt_key_t *
320xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
321#define XFS_BMAP_KEY_IADDR(bb,i,cur) xfs_bmap_key_iaddr(bb,i,cur)
322#else
323#define XFS_BMAP_KEY_IADDR(bb,i,cur) \
324 XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \
325 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
326 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
327 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
328#endif
329
330#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_DADDR)
331xfs_bmbt_ptr_t *
332xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
333#define XFS_BMAP_PTR_DADDR(bb,i,cur) xfs_bmap_ptr_daddr(bb,i,cur)
334#else
335#define XFS_BMAP_PTR_DADDR(bb,i,cur) \
336 XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \
337 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
338 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
339 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
340#endif
341#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_IADDR)
342xfs_bmbt_ptr_t *
343xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
344#define XFS_BMAP_PTR_IADDR(bb,i,cur) xfs_bmap_ptr_iaddr(bb,i,cur)
345#else
346#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
347 XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \
348 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
349 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
350 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
351#endif
352
353/*
354 * These are to be used when we know the size of the block and
355 * we don't have a cursor.
356 */
357#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
358xfs_bmbt_rec_t *xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz);
359#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) xfs_bmap_broot_rec_addr(bb,i,sz)
360#else
361#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
362 XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
363#endif
364#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
365xfs_bmbt_key_t *xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz);
366#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) xfs_bmap_broot_key_addr(bb,i,sz)
367#else
368#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
369 XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
370#endif
371#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
372xfs_bmbt_ptr_t *xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz);
373#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) xfs_bmap_broot_ptr_addr(bb,i,sz)
374#else
375#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
376 XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
377#endif
378
379#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_NUMRECS)
380int xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb);
381#define XFS_BMAP_BROOT_NUMRECS(bb) xfs_bmap_broot_numrecs(bb)
382#else
383#define XFS_BMAP_BROOT_NUMRECS(bb) (INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
384#endif
385#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_MAXRECS)
386int xfs_bmap_broot_maxrecs(int sz);
387#define XFS_BMAP_BROOT_MAXRECS(sz) xfs_bmap_broot_maxrecs(sz)
388#else
389#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
390#endif
391#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
392int xfs_bmap_broot_space_calc(int nrecs);
393#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) xfs_bmap_broot_space_calc(nrecs)
394#else
395#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
396 ((int)(sizeof(xfs_bmbt_block_t) + \
397 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
398#endif
399#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE)
400int xfs_bmap_broot_space(xfs_bmdr_block_t *bb);
401#define XFS_BMAP_BROOT_SPACE(bb) xfs_bmap_broot_space(bb)
402#else
403#define XFS_BMAP_BROOT_SPACE(bb) \
404 XFS_BMAP_BROOT_SPACE_CALC(INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
405#endif
406#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMDR_SPACE_CALC)
407int xfs_bmdr_space_calc(int nrecs);
408#define XFS_BMDR_SPACE_CALC(nrecs) xfs_bmdr_space_calc(nrecs)
409#else
410#define XFS_BMDR_SPACE_CALC(nrecs) \
411 ((int)(sizeof(xfs_bmdr_block_t) + \
412 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
413#endif
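
As a worked example of the two formulas above, assuming 8-byte keys and
pointers and the 4-byte xfs_bmdr_block_t defined earlier in this header:
XFS_BMDR_SPACE_CALC(10) = 4 + 10 * (8 + 8) = 164 bytes for a 10-record
on-disk root, and XFS_BMAP_BROOT_SPACE_CALC(10) differs only by using the
in-core block header size instead.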
414
415/*
416 * Maximum number of bmap btree levels.
417 */
418#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BM_MAXLEVELS)
419int xfs_bm_maxlevels(struct xfs_mount *mp, int w);
420#define XFS_BM_MAXLEVELS(mp,w) xfs_bm_maxlevels(mp,w)
421#else
422#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[w])
423#endif
424
425#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_SANITY_CHECK)
426int xfs_bmap_sanity_check(struct xfs_mount *mp, xfs_bmbt_block_t *bb,
427 int level);
428#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
429 xfs_bmap_sanity_check(mp,bb,level)
430#else
431#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
432 (INT_GET((bb)->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC && \
433 INT_GET((bb)->bb_level, ARCH_CONVERT) == level && \
434 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) > 0 && \
435 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0])
436#endif
437
438
439#ifdef __KERNEL__
440
441#if defined(XFS_BMBT_TRACE)
442/*
443 * Trace buffer entry types.
444 */
445#define XFS_BMBT_KTRACE_ARGBI 1
446#define XFS_BMBT_KTRACE_ARGBII 2
447#define XFS_BMBT_KTRACE_ARGFFFI 3
448#define XFS_BMBT_KTRACE_ARGI 4
449#define XFS_BMBT_KTRACE_ARGIFK 5
450#define XFS_BMBT_KTRACE_ARGIFR 6
451#define XFS_BMBT_KTRACE_ARGIK 7
452#define XFS_BMBT_KTRACE_CUR 8
453
454#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
455#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
456extern ktrace_t *xfs_bmbt_trace_buf;
457#endif
458
459/*
460 * Prototypes for xfs_bmap.c to call.
461 */
462
463void
464xfs_bmdr_to_bmbt(
465 xfs_bmdr_block_t *,
466 int,
467 xfs_bmbt_block_t *,
468 int);
469
470int
471xfs_bmbt_decrement(
472 struct xfs_btree_cur *,
473 int,
474 int *);
475
476int
477xfs_bmbt_delete(
478 struct xfs_btree_cur *,
479 int *);
480
481void
482xfs_bmbt_get_all(
483 xfs_bmbt_rec_t *r,
484 xfs_bmbt_irec_t *s);
485
486xfs_bmbt_block_t *
487xfs_bmbt_get_block(
488 struct xfs_btree_cur *cur,
489 int level,
490 struct xfs_buf **bpp);
491
492xfs_filblks_t
493xfs_bmbt_get_blockcount(
494 xfs_bmbt_rec_t *r);
495
496xfs_fsblock_t
497xfs_bmbt_get_startblock(
498 xfs_bmbt_rec_t *r);
499
500xfs_fileoff_t
501xfs_bmbt_get_startoff(
502 xfs_bmbt_rec_t *r);
503
504xfs_exntst_t
505xfs_bmbt_get_state(
506 xfs_bmbt_rec_t *r);
507
508#if __BYTE_ORDER != __BIG_ENDIAN
509void
510xfs_bmbt_disk_get_all(
511 xfs_bmbt_rec_t *r,
512 xfs_bmbt_irec_t *s);
513
514xfs_exntst_t
515xfs_bmbt_disk_get_state(
516 xfs_bmbt_rec_t *r);
517
518xfs_filblks_t
519xfs_bmbt_disk_get_blockcount(
520 xfs_bmbt_rec_t *r);
521
522xfs_fsblock_t
523xfs_bmbt_disk_get_startblock(
524 xfs_bmbt_rec_t *r);
525
526xfs_fileoff_t
527xfs_bmbt_disk_get_startoff(
528 xfs_bmbt_rec_t *r);
529
530#else
531#define xfs_bmbt_disk_get_all(r, s) \
532 xfs_bmbt_get_all(r, s)
533#define xfs_bmbt_disk_get_state(r) \
534 xfs_bmbt_get_state(r)
535#define xfs_bmbt_disk_get_blockcount(r) \
536 xfs_bmbt_get_blockcount(r)
537#define xfs_bmbt_disk_get_startblock(r) \
538 xfs_bmbt_get_blockcount(r)
539#define xfs_bmbt_disk_get_startoff(r) \
540 xfs_bmbt_get_startoff(r)
541#endif
542
543int
544xfs_bmbt_increment(
545 struct xfs_btree_cur *,
546 int,
547 int *);
548
549int
550xfs_bmbt_insert(
551 struct xfs_btree_cur *,
552 int *);
553
554void
555xfs_bmbt_log_block(
556 struct xfs_btree_cur *,
557 struct xfs_buf *,
558 int);
559
560void
561xfs_bmbt_log_recs(
562 struct xfs_btree_cur *,
563 struct xfs_buf *,
564 int,
565 int);
566
567int
568xfs_bmbt_lookup_eq(
569 struct xfs_btree_cur *,
570 xfs_fileoff_t,
571 xfs_fsblock_t,
572 xfs_filblks_t,
573 int *);
574
575int
576xfs_bmbt_lookup_ge(
577 struct xfs_btree_cur *,
578 xfs_fileoff_t,
579 xfs_fsblock_t,
580 xfs_filblks_t,
581 int *);
582
583int
584xfs_bmbt_lookup_le(
585 struct xfs_btree_cur *,
586 xfs_fileoff_t,
587 xfs_fsblock_t,
588 xfs_filblks_t,
589 int *);
590
591/*
592 * Give the bmap btree a new root block. Copy the old broot contents
593 * down into a real block and make the broot point to it.
594 */
595int /* error */
596xfs_bmbt_newroot(
597 struct xfs_btree_cur *cur, /* btree cursor */
598 int *logflags, /* logging flags for inode */
599 int *stat); /* return status - 0 fail */
600
601void
602xfs_bmbt_set_all(
603 xfs_bmbt_rec_t *r,
604 xfs_bmbt_irec_t *s);
605
606void
607xfs_bmbt_set_allf(
608 xfs_bmbt_rec_t *r,
609 xfs_fileoff_t o,
610 xfs_fsblock_t b,
611 xfs_filblks_t c,
612 xfs_exntst_t v);
613
614void
615xfs_bmbt_set_blockcount(
616 xfs_bmbt_rec_t *r,
617 xfs_filblks_t v);
618
619void
620xfs_bmbt_set_startblock(
621 xfs_bmbt_rec_t *r,
622 xfs_fsblock_t v);
623
624void
625xfs_bmbt_set_startoff(
626 xfs_bmbt_rec_t *r,
627 xfs_fileoff_t v);
628
629void
630xfs_bmbt_set_state(
631 xfs_bmbt_rec_t *r,
632 xfs_exntst_t v);
633
634#if __BYTE_ORDER != __BIG_ENDIAN
635void
636xfs_bmbt_disk_set_all(
637 xfs_bmbt_rec_t *r,
638 xfs_bmbt_irec_t *s);
639
640void
641xfs_bmbt_disk_set_allf(
642 xfs_bmbt_rec_t *r,
643 xfs_fileoff_t o,
644 xfs_fsblock_t b,
645 xfs_filblks_t c,
646 xfs_exntst_t v);
647#else
648#define xfs_bmbt_disk_set_all(r, s) \
649 xfs_bmbt_set_all(r, s)
650#define xfs_bmbt_disk_set_allf(r, o, b, c, v) \
651 xfs_bmbt_set_allf(r, o, b, c, v)
652#endif
653
654void
655xfs_bmbt_to_bmdr(
656 xfs_bmbt_block_t *,
657 int,
658 xfs_bmdr_block_t *,
659 int);
660
661int
662xfs_bmbt_update(
663 struct xfs_btree_cur *,
664 xfs_fileoff_t,
665 xfs_fsblock_t,
666 xfs_filblks_t,
667 xfs_exntst_t);
668
669#ifdef DEBUG
670/*
671 * Get the data from the pointed-to record.
672 */
673int
674xfs_bmbt_get_rec(
675 struct xfs_btree_cur *,
676 xfs_fileoff_t *,
677 xfs_fsblock_t *,
678 xfs_filblks_t *,
679 xfs_exntst_t *,
680 int *);
681#endif
682
683
684/*
685 * Search an extent list for the extent which includes block
686 * bno.
687 */
688xfs_bmbt_rec_t *
689xfs_bmap_do_search_extents(
690 xfs_bmbt_rec_t *,
691 xfs_extnum_t,
692 xfs_extnum_t,
693 xfs_fileoff_t,
694 int *,
695 xfs_extnum_t *,
696 xfs_bmbt_irec_t *,
697 xfs_bmbt_irec_t *);
698
699#endif /* __KERNEL__ */
700
701#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
new file mode 100644
index 000000000000..9dd22dd95487
--- /dev/null
+++ b/fs/xfs/xfs_btree.c
@@ -0,0 +1,949 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains common code for the space manager's btree implementations.
35 */
36
37#include "xfs.h"
38
39#include "xfs_macros.h"
40#include "xfs_types.h"
41#include "xfs_inum.h"
42#include "xfs_log.h"
43#include "xfs_trans.h"
44#include "xfs_sb.h"
45#include "xfs_ag.h"
46#include "xfs_dir.h"
47#include "xfs_dir2.h"
48#include "xfs_dmapi.h"
49#include "xfs_mount.h"
50#include "xfs_alloc_btree.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_ialloc_btree.h"
53#include "xfs_btree.h"
54#include "xfs_ialloc.h"
55#include "xfs_attr_sf.h"
56#include "xfs_dir_sf.h"
57#include "xfs_dir2_sf.h"
58#include "xfs_dinode.h"
59#include "xfs_inode.h"
60#include "xfs_bit.h"
61#include "xfs_error.h"
62
63/*
64 * Cursor allocation zone.
65 */
66kmem_zone_t *xfs_btree_cur_zone;
67
68/*
69 * Btree magic numbers.
70 */
71const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
72{
73 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
74};
75
76/*
77 * Prototypes for internal routines.
78 */
79
80/*
81 * Checking routine: return maxrecs for the block.
82 */
83STATIC int /* number of records fitting in block */
84xfs_btree_maxrecs(
85 xfs_btree_cur_t *cur, /* btree cursor */
86 xfs_btree_block_t *block);/* generic btree block pointer */
87
88/*
89 * Internal routines.
90 */
91
92/*
93 * Checking routine: return maxrecs for the block.
94 */
95STATIC int /* number of records fitting in block */
96xfs_btree_maxrecs(
97 xfs_btree_cur_t *cur, /* btree cursor */
98 xfs_btree_block_t *block) /* generic btree block pointer */
99{
100 switch (cur->bc_btnum) {
101 case XFS_BTNUM_BNO:
102 case XFS_BTNUM_CNT:
103 return (int)XFS_ALLOC_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
104 case XFS_BTNUM_BMAP:
105 return (int)XFS_BMAP_BLOCK_IMAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
106 case XFS_BTNUM_INO:
107 return (int)XFS_INOBT_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
108 default:
109 ASSERT(0);
110 return 0;
111 }
112}
113
114/*
115 * External routines.
116 */
117
118#ifdef DEBUG
119/*
120 * Debug routine: check that block header is ok.
121 */
122void
123xfs_btree_check_block(
124 xfs_btree_cur_t *cur, /* btree cursor */
125 xfs_btree_block_t *block, /* generic btree block pointer */
126 int level, /* level of the btree block */
127 xfs_buf_t *bp) /* buffer containing block, if any */
128{
129 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
130 xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
131 bp);
132 else
133 xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
134 bp);
135}
136
137/*
138 * Debug routine: check that keys are in the right order.
139 */
140void
141xfs_btree_check_key(
142 xfs_btnum_t btnum, /* btree identifier */
143 void *ak1, /* pointer to left (lower) key */
144 void *ak2) /* pointer to right (higher) key */
145{
146 switch (btnum) {
147 case XFS_BTNUM_BNO: {
148 xfs_alloc_key_t *k1;
149 xfs_alloc_key_t *k2;
150
151 k1 = ak1;
152 k2 = ak2;
153 ASSERT(INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT));
154 break;
155 }
156 case XFS_BTNUM_CNT: {
157 xfs_alloc_key_t *k1;
158 xfs_alloc_key_t *k2;
159
160 k1 = ak1;
161 k2 = ak2;
162 ASSERT(INT_GET(k1->ar_blockcount, ARCH_CONVERT) < INT_GET(k2->ar_blockcount, ARCH_CONVERT) ||
163 (INT_GET(k1->ar_blockcount, ARCH_CONVERT) == INT_GET(k2->ar_blockcount, ARCH_CONVERT) &&
164 INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT)));
165 break;
166 }
167 case XFS_BTNUM_BMAP: {
168 xfs_bmbt_key_t *k1;
169 xfs_bmbt_key_t *k2;
170
171 k1 = ak1;
172 k2 = ak2;
173 ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT));
174 break;
175 }
176 case XFS_BTNUM_INO: {
177 xfs_inobt_key_t *k1;
178 xfs_inobt_key_t *k2;
179
180 k1 = ak1;
181 k2 = ak2;
182 ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT));
183 break;
184 }
185 default:
186 ASSERT(0);
187 }
188}
189#endif /* DEBUG */
190
191/*
192 * Checking routine: check that long form block header is ok.
193 */
194/* ARGSUSED */
195int /* error (0 or EFSCORRUPTED) */
196xfs_btree_check_lblock(
197 xfs_btree_cur_t *cur, /* btree cursor */
198 xfs_btree_lblock_t *block, /* btree long form block pointer */
199 int level, /* level of the btree block */
200 xfs_buf_t *bp) /* buffer for block, if any */
201{
202 int lblock_ok; /* block passes checks */
203 xfs_mount_t *mp; /* file system mount point */
204
205 mp = cur->bc_mp;
206 lblock_ok =
207 INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
208 INT_GET(block->bb_level, ARCH_CONVERT) == level &&
209 INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
210 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
211 block->bb_leftsib &&
212 (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO ||
213 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_leftsib, ARCH_CONVERT))) &&
214 block->bb_rightsib &&
215 (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO ||
216 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_rightsib, ARCH_CONVERT)));
217 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
218 XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
219 if (bp)
220 xfs_buftrace("LBTREE ERROR", bp);
221 XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
222 mp);
223 return XFS_ERROR(EFSCORRUPTED);
224 }
225 return 0;
226}
227
228/*
229 * Checking routine: check that (long) pointer is ok.
230 */
231int /* error (0 or EFSCORRUPTED) */
232xfs_btree_check_lptr(
233 xfs_btree_cur_t *cur, /* btree cursor */
234 xfs_dfsbno_t ptr, /* btree block disk address */
235 int level) /* btree block level */
236{
237 xfs_mount_t *mp; /* file system mount point */
238
239 mp = cur->bc_mp;
240 XFS_WANT_CORRUPTED_RETURN(
241 level > 0 &&
242 ptr != NULLDFSBNO &&
243 XFS_FSB_SANITY_CHECK(mp, ptr));
244 return 0;
245}
246
247#ifdef DEBUG
248/*
249 * Debug routine: check that records are in the right order.
250 */
251void
252xfs_btree_check_rec(
253 xfs_btnum_t btnum, /* btree identifier */
254 void *ar1, /* pointer to left (lower) record */
255 void *ar2) /* pointer to right (higher) record */
256{
257 switch (btnum) {
258 case XFS_BTNUM_BNO: {
259 xfs_alloc_rec_t *r1;
260 xfs_alloc_rec_t *r2;
261
262 r1 = ar1;
263 r2 = ar2;
264 ASSERT(INT_GET(r1->ar_startblock, ARCH_CONVERT) + INT_GET(r1->ar_blockcount, ARCH_CONVERT) <=
265 INT_GET(r2->ar_startblock, ARCH_CONVERT));
266 break;
267 }
268 case XFS_BTNUM_CNT: {
269 xfs_alloc_rec_t *r1;
270 xfs_alloc_rec_t *r2;
271
272 r1 = ar1;
273 r2 = ar2;
274 ASSERT(INT_GET(r1->ar_blockcount, ARCH_CONVERT) < INT_GET(r2->ar_blockcount, ARCH_CONVERT) ||
275 (INT_GET(r1->ar_blockcount, ARCH_CONVERT) == INT_GET(r2->ar_blockcount, ARCH_CONVERT) &&
276 INT_GET(r1->ar_startblock, ARCH_CONVERT) < INT_GET(r2->ar_startblock, ARCH_CONVERT)));
277 break;
278 }
279 case XFS_BTNUM_BMAP: {
280 xfs_bmbt_rec_t *r1;
281 xfs_bmbt_rec_t *r2;
282
283 r1 = ar1;
284 r2 = ar2;
285 ASSERT(xfs_bmbt_disk_get_startoff(r1) +
286 xfs_bmbt_disk_get_blockcount(r1) <=
287 xfs_bmbt_disk_get_startoff(r2));
288 break;
289 }
290 case XFS_BTNUM_INO: {
291 xfs_inobt_rec_t *r1;
292 xfs_inobt_rec_t *r2;
293
294 r1 = ar1;
295 r2 = ar2;
296 ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <=
297 INT_GET(r2->ir_startino, ARCH_CONVERT));
298 break;
299 }
300 default:
301 ASSERT(0);
302 }
303}
304#endif /* DEBUG */
305
306/*
307 * Checking routine: check that block header is ok.
308 */
309/* ARGSUSED */
310int /* error (0 or EFSCORRUPTED) */
311xfs_btree_check_sblock(
312 xfs_btree_cur_t *cur, /* btree cursor */
313 xfs_btree_sblock_t *block, /* btree short form block pointer */
314 int level, /* level of the btree block */
315 xfs_buf_t *bp) /* buffer containing block */
316{
317 xfs_buf_t *agbp; /* buffer for ag. freespace struct */
318 xfs_agf_t *agf; /* ag. freespace structure */
319 xfs_agblock_t agflen; /* native ag. freespace length */
320 int sblock_ok; /* block passes checks */
321
322 agbp = cur->bc_private.a.agbp;
323 agf = XFS_BUF_TO_AGF(agbp);
324 agflen = INT_GET(agf->agf_length, ARCH_CONVERT);
325 sblock_ok =
326 INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
327 INT_GET(block->bb_level, ARCH_CONVERT) == level &&
328 INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
329 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
330 (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK ||
331 INT_GET(block->bb_leftsib, ARCH_CONVERT) < agflen) &&
332 block->bb_leftsib &&
333 (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK ||
334 INT_GET(block->bb_rightsib, ARCH_CONVERT) < agflen) &&
335 block->bb_rightsib;
336 if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
337 XFS_ERRTAG_BTREE_CHECK_SBLOCK,
338 XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
339 if (bp)
340 xfs_buftrace("SBTREE ERROR", bp);
341 XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
342 cur->bc_mp);
343 return XFS_ERROR(EFSCORRUPTED);
344 }
345 return 0;
346}
347
348/*
349 * Checking routine: check that (short) pointer is ok.
350 */
351int /* error (0 or EFSCORRUPTED) */
352xfs_btree_check_sptr(
353 xfs_btree_cur_t *cur, /* btree cursor */
354 xfs_agblock_t ptr, /* btree block disk address */
355 int level) /* btree block level */
356{
357 xfs_buf_t *agbp; /* buffer for ag. freespace struct */
358 xfs_agf_t *agf; /* ag. freespace structure */
359
360 agbp = cur->bc_private.a.agbp;
361 agf = XFS_BUF_TO_AGF(agbp);
362 XFS_WANT_CORRUPTED_RETURN(
363 level > 0 &&
364 ptr != NULLAGBLOCK && ptr != 0 &&
365 ptr < INT_GET(agf->agf_length, ARCH_CONVERT));
366 return 0;
367}
368
369/*
370 * Delete the btree cursor.
371 */
372void
373xfs_btree_del_cursor(
374 xfs_btree_cur_t *cur, /* btree cursor */
375 int error) /* del because of error */
376{
377 int i; /* btree level */
378
379 /*
380 * Clear the buffer pointers, and release the buffers.
381 * If we're doing this in the face of an error, we
382 * need to make sure to inspect all of the entries
383 * in the bc_bufs array for buffers to be unlocked.
384 * This is because some of the btree code works from
385 * level n down to 0, and if we get an error along
386 * the way we won't have initialized all the entries
387 * down to 0.
388 */
389 for (i = 0; i < cur->bc_nlevels; i++) {
390 if (cur->bc_bufs[i])
391 xfs_btree_setbuf(cur, i, NULL);
392 else if (!error)
393 break;
394 }
395 /*
396 * Can't free a bmap cursor without having dealt with the
397 * allocated indirect blocks' accounting.
398 */
399 ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
400 cur->bc_private.b.allocated == 0);
401 /*
402 * Free the cursor.
403 */
404 kmem_zone_free(xfs_btree_cur_zone, cur);
405}
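/*
 * Illustrative sketch (editor's example, not part of this file): the
 * usual teardown pattern.  Passing XFS_BTREE_ERROR forces every
 * bc_bufs[] slot to be inspected and released, per the comment above;
 * do_btree_op() is a hypothetical stand-in for any cursor operation.
 */
static int
example_teardown(
	xfs_btree_cur_t		*cur)	/* btree cursor */
{
	int			error;

	error = do_btree_op(cur);	/* hypothetical operation */
	xfs_btree_del_cursor(cur,
		error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
	return error;
}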
406
407/*
408 * Duplicate the btree cursor.
409 * Allocate a new one, copy the record, re-get the buffers.
410 */
411int /* error */
412xfs_btree_dup_cursor(
413 xfs_btree_cur_t *cur, /* input cursor */
414 xfs_btree_cur_t **ncur) /* output cursor */
415{
416 xfs_buf_t *bp; /* btree block's buffer pointer */
417 int error; /* error return value */
418 int i; /* level number of btree block */
419 xfs_mount_t *mp; /* mount structure for filesystem */
420 xfs_btree_cur_t *new; /* new cursor value */
421 xfs_trans_t *tp; /* transaction pointer, can be NULL */
422
423 tp = cur->bc_tp;
424 mp = cur->bc_mp;
425 /*
426 * Allocate a new cursor like the old one.
427 */
428 new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
429 cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
430 cur->bc_private.b.whichfork);
431 /*
432 * Copy the record currently in the cursor.
433 */
434 new->bc_rec = cur->bc_rec;
435 /*
436 * For each level current, re-get the buffer and copy the ptr value.
437 */
438 for (i = 0; i < new->bc_nlevels; i++) {
439 new->bc_ptrs[i] = cur->bc_ptrs[i];
440 new->bc_ra[i] = cur->bc_ra[i];
441 if ((bp = cur->bc_bufs[i])) {
442 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
443 XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
444 xfs_btree_del_cursor(new, error);
445 *ncur = NULL;
446 return error;
447 }
448 new->bc_bufs[i] = bp;
449 ASSERT(bp);
450 ASSERT(!XFS_BUF_GETERROR(bp));
451 } else
452 new->bc_bufs[i] = NULL;
453 }
454 /*
455 * For bmap btrees, copy the firstblock, flist, and flags values,
456 * since init cursor doesn't get them.
457 */
458 if (new->bc_btnum == XFS_BTNUM_BMAP) {
459 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
460 new->bc_private.b.flist = cur->bc_private.b.flist;
461 new->bc_private.b.flags = cur->bc_private.b.flags;
462 }
463 *ncur = new;
464 return 0;
465}
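/*
 * Illustrative sketch (editor's example, not part of this file):
 * because xfs_btree_dup_cursor() sets *ncur to NULL on failure,
 * callers only clean up the duplicate on the success path.
 */
static int
example_dup(
	xfs_btree_cur_t		*cur)	/* input cursor */
{
	xfs_btree_cur_t		*ncur;	/* duplicate cursor */
	int			error;

	if ((error = xfs_btree_dup_cursor(cur, &ncur)))
		return error;		/* ncur is NULL, nothing to free */
	/* ... probe with ncur without disturbing cur ... */
	xfs_btree_del_cursor(ncur, XFS_BTREE_NOERROR);
	return 0;
}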
466
467/*
468 * Change the cursor to point to the first record in the current block
469 * at the given level. Other levels are unaffected.
470 */
471int /* success=1, failure=0 */
472xfs_btree_firstrec(
473 xfs_btree_cur_t *cur, /* btree cursor */
474 int level) /* level to change */
475{
476 xfs_btree_block_t *block; /* generic btree block pointer */
477 xfs_buf_t *bp; /* buffer containing block */
478
479 /*
480 * Get the block pointer for this level.
481 */
482 block = xfs_btree_get_block(cur, level, &bp);
483 xfs_btree_check_block(cur, block, level, bp);
484 /*
485 * It's empty, there is no such record.
486 */
487 if (!block->bb_h.bb_numrecs)
488 return 0;
489 /*
490 * Set the ptr value to 1, that's the first record/key.
491 */
492 cur->bc_ptrs[level] = 1;
493 return 1;
494}
495
496/*
497 * Retrieve the block pointer from the cursor at the given level.
498 * This may be a bmap btree root or from a buffer.
499 */
500xfs_btree_block_t * /* generic btree block pointer */
501xfs_btree_get_block(
502 xfs_btree_cur_t *cur, /* btree cursor */
503 int level, /* level in btree */
504 xfs_buf_t **bpp) /* buffer containing the block */
505{
506 xfs_btree_block_t *block; /* return value */
507 xfs_buf_t *bp; /* return buffer */
508 xfs_ifork_t *ifp; /* inode fork pointer */
509 int whichfork; /* data or attr fork */
510
511 if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
512 whichfork = cur->bc_private.b.whichfork;
513 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
514 block = (xfs_btree_block_t *)ifp->if_broot;
515 bp = NULL;
516 } else {
517 bp = cur->bc_bufs[level];
518 block = XFS_BUF_TO_BLOCK(bp);
519 }
520 ASSERT(block != NULL);
521 *bpp = bp;
522 return block;
523}
524
525/*
526 * Get a buffer for the block, return it with no data read.
527 * Long-form addressing.
528 */
529xfs_buf_t * /* buffer for fsbno */
530xfs_btree_get_bufl(
531 xfs_mount_t *mp, /* file system mount point */
532 xfs_trans_t *tp, /* transaction pointer */
533 xfs_fsblock_t fsbno, /* file system block number */
534 uint lock) /* lock flags for get_buf */
535{
536 xfs_buf_t *bp; /* buffer pointer (return value) */
537 xfs_daddr_t d; /* real disk block address */
538
539 ASSERT(fsbno != NULLFSBLOCK);
540 d = XFS_FSB_TO_DADDR(mp, fsbno);
541 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
542 ASSERT(bp);
543 ASSERT(!XFS_BUF_GETERROR(bp));
544 return bp;
545}
546
547/*
548 * Get a buffer for the block, return it with no data read.
549 * Short-form addressing.
550 */
551xfs_buf_t * /* buffer for agno/agbno */
552xfs_btree_get_bufs(
553 xfs_mount_t *mp, /* file system mount point */
554 xfs_trans_t *tp, /* transaction pointer */
555 xfs_agnumber_t agno, /* allocation group number */
556 xfs_agblock_t agbno, /* allocation group block number */
557 uint lock) /* lock flags for get_buf */
558{
559 xfs_buf_t *bp; /* buffer pointer (return value) */
560 xfs_daddr_t d; /* real disk block address */
561
562 ASSERT(agno != NULLAGNUMBER);
563 ASSERT(agbno != NULLAGBLOCK);
564 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
565 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
566 ASSERT(bp);
567 ASSERT(!XFS_BUF_GETERROR(bp));
568 return bp;
569}
570
571/*
572 * Allocate a new btree cursor.
573 * The cursor is either for allocation (A) or bmap (B) or inodes (I).
574 */
575xfs_btree_cur_t * /* new btree cursor */
576xfs_btree_init_cursor(
577 xfs_mount_t *mp, /* file system mount point */
578 xfs_trans_t *tp, /* transaction pointer */
579 xfs_buf_t *agbp, /* (A only) buffer for agf structure */
580 /* (I only) buffer for agi structure */
581 xfs_agnumber_t agno, /* (AI only) allocation group number */
582 xfs_btnum_t btnum, /* btree identifier */
583 xfs_inode_t *ip, /* (B only) inode owning the btree */
584 int whichfork) /* (B only) data or attr fork */
585{
586 xfs_agf_t *agf; /* (A) allocation group freespace */
587 xfs_agi_t *agi; /* (I) allocation group inodespace */
588 xfs_btree_cur_t *cur; /* return value */
589 xfs_ifork_t *ifp; /* (B) inode fork pointer */
590 int nlevels=0; /* number of levels in the btree */
591
592 ASSERT(xfs_btree_cur_zone != NULL);
593 /*
594 * Allocate a new cursor.
595 */
596 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
597 /*
598 * Deduce the number of btree levels from the arguments.
599 */
600 switch (btnum) {
601 case XFS_BTNUM_BNO:
602 case XFS_BTNUM_CNT:
603 agf = XFS_BUF_TO_AGF(agbp);
604 nlevels = INT_GET(agf->agf_levels[btnum], ARCH_CONVERT);
605 break;
606 case XFS_BTNUM_BMAP:
607 ifp = XFS_IFORK_PTR(ip, whichfork);
608 nlevels = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
609 break;
610 case XFS_BTNUM_INO:
611 agi = XFS_BUF_TO_AGI(agbp);
612 nlevels = INT_GET(agi->agi_level, ARCH_CONVERT);
613 break;
614 default:
615 ASSERT(0);
616 }
617 /*
618 * Fill in the common fields.
619 */
620 cur->bc_tp = tp;
621 cur->bc_mp = mp;
622 cur->bc_nlevels = nlevels;
623 cur->bc_btnum = btnum;
624 cur->bc_blocklog = mp->m_sb.sb_blocklog;
625 /*
626 * Fill in private fields.
627 */
628 switch (btnum) {
629 case XFS_BTNUM_BNO:
630 case XFS_BTNUM_CNT:
631 /*
632 * Allocation btree fields.
633 */
634 cur->bc_private.a.agbp = agbp;
635 cur->bc_private.a.agno = agno;
636 break;
637 case XFS_BTNUM_BMAP:
638 /*
639 * Bmap btree fields.
640 */
641 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
642 cur->bc_private.b.ip = ip;
643 cur->bc_private.b.firstblock = NULLFSBLOCK;
644 cur->bc_private.b.flist = NULL;
645 cur->bc_private.b.allocated = 0;
646 cur->bc_private.b.flags = 0;
647 cur->bc_private.b.whichfork = whichfork;
648 break;
649 case XFS_BTNUM_INO:
650 /*
651 * Inode allocation btree fields.
652 */
653 cur->bc_private.i.agbp = agbp;
654 cur->bc_private.i.agno = agno;
655 break;
656 default:
657 ASSERT(0);
658 }
659 return cur;
660}
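/*
 * Illustrative sketch (editor's example, not part of this file):
 * creating a by-block-number freespace btree cursor.  The (B only)
 * arguments are passed as NULL/0 since they are unused for the
 * allocation btrees, mirroring the A/B/I argument groups above.
 */
static xfs_btree_cur_t *
example_bnobt_cursor(
	xfs_mount_t		*mp,	/* file system mount point */
	xfs_trans_t		*tp,	/* transaction pointer */
	xfs_buf_t		*agbp,	/* buffer for agf structure */
	xfs_agnumber_t		agno)	/* allocation group number */
{
	return xfs_btree_init_cursor(mp, tp, agbp, agno,
			XFS_BTNUM_BNO, NULL, 0);
}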
661
662/*
663 * Check for the cursor referring to the last block at the given level.
664 */
665int /* 1=is last block, 0=not last block */
666xfs_btree_islastblock(
667 xfs_btree_cur_t *cur, /* btree cursor */
668 int level) /* level to check */
669{
670 xfs_btree_block_t *block; /* generic btree block pointer */
671 xfs_buf_t *bp; /* buffer containing block */
672
673 block = xfs_btree_get_block(cur, level, &bp);
674 xfs_btree_check_block(cur, block, level, bp);
675 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
676 return INT_GET(block->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO;
677 else
678 return INT_GET(block->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK;
679}
680
681/*
682 * Change the cursor to point to the last record in the current block
683 * at the given level. Other levels are unaffected.
684 */
685int /* success=1, failure=0 */
686xfs_btree_lastrec(
687 xfs_btree_cur_t *cur, /* btree cursor */
688 int level) /* level to change */
689{
690 xfs_btree_block_t *block; /* generic btree block pointer */
691 xfs_buf_t *bp; /* buffer containing block */
692
693 /*
694 * Get the block pointer for this level.
695 */
696 block = xfs_btree_get_block(cur, level, &bp);
697 xfs_btree_check_block(cur, block, level, bp);
698 /*
699 * It's empty, there is no such record.
700 */
701 if (!block->bb_h.bb_numrecs)
702 return 0;
703 /*
704 * Set the ptr value to numrecs, that's the last record/key.
705 */
706 cur->bc_ptrs[level] = INT_GET(block->bb_h.bb_numrecs, ARCH_CONVERT);
707 return 1;
708}
709
710/*
711 * Compute first and last byte offsets for the fields given.
712 * Interprets the offsets table, which contains struct field offsets.
713 */
714void
715xfs_btree_offsets(
716 __int64_t fields, /* bitmask of fields */
717 const short *offsets, /* table of field offsets */
718 int nbits, /* number of bits to inspect */
719 int *first, /* output: first byte offset */
720 int *last) /* output: last byte offset */
721{
722 int i; /* current bit number */
723 __int64_t imask; /* mask for current bit number */
724
725 ASSERT(fields != 0);
726 /*
727 * Find the lowest bit, so the first byte offset.
728 */
729 for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
730 if (imask & fields) {
731 *first = offsets[i];
732 break;
733 }
734 }
735 /*
736 * Find the highest bit, so the last byte offset.
737 */
738 for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
739 if (imask & fields) {
740 *last = offsets[i + 1] - 1;
741 break;
742 }
743 }
744}
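/*
 * Editor's worked example (not part of this file), assuming the
 * short-form header layout: 4-byte bb_magic at 0, 2-byte bb_level at
 * 4, 2-byte bb_numrecs at 6, 4-byte siblings at 8 and 12, and a
 * terminating sizeof entry of 16 in the offsets table.  Logging the
 * fields XFS_BB_NUMRECS|XFS_BB_LEFTSIB (bits 2 and 3) then yields:
 *
 *	lowest set bit 2:  *first = offsets[2]     = 6
 *	highest set bit 3: *last  = offsets[4] - 1 = 11
 *
 * so bytes 6..11 of the block header are logged as a single range.
 */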
745
746/*
747 * Get a buffer for the block, return it read in.
748 * Long-form addressing.
749 */
750int /* error */
751xfs_btree_read_bufl(
752 xfs_mount_t *mp, /* file system mount point */
753 xfs_trans_t *tp, /* transaction pointer */
754 xfs_fsblock_t fsbno, /* file system block number */
755 uint lock, /* lock flags for read_buf */
756 xfs_buf_t **bpp, /* buffer for fsbno */
757 int refval) /* ref count value for buffer */
758{
759 xfs_buf_t *bp; /* return value */
760 xfs_daddr_t d; /* real disk block address */
761 int error;
762
763 ASSERT(fsbno != NULLFSBLOCK);
764 d = XFS_FSB_TO_DADDR(mp, fsbno);
765 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
766 mp->m_bsize, lock, &bp))) {
767 return error;
768 }
769 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
770 if (bp != NULL) {
771 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
772 }
773 *bpp = bp;
774 return 0;
775}
776
777/*
778 * Get a buffer for the block, return it read in.
779 * Short-form addressing.
780 */
781int /* error */
782xfs_btree_read_bufs(
783 xfs_mount_t *mp, /* file system mount point */
784 xfs_trans_t *tp, /* transaction pointer */
785 xfs_agnumber_t agno, /* allocation group number */
786 xfs_agblock_t agbno, /* allocation group block number */
787 uint lock, /* lock flags for read_buf */
788 xfs_buf_t **bpp, /* buffer for agno/agbno */
789 int refval) /* ref count value for buffer */
790{
791 xfs_buf_t *bp; /* return value */
792 xfs_daddr_t d; /* real disk block address */
793 int error;
794
795 ASSERT(agno != NULLAGNUMBER);
796 ASSERT(agbno != NULLAGBLOCK);
797 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
798 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
799 mp->m_bsize, lock, &bp))) {
800 return error;
801 }
802 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
803 if (bp != NULL) {
804 switch (refval) {
805 case XFS_ALLOC_BTREE_REF:
806 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
807 break;
808 case XFS_INO_BTREE_REF:
809 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
810 break;
811 }
812 }
813 *bpp = bp;
814 return 0;
815}
816
817/*
818 * Read-ahead the block, don't wait for it, don't return a buffer.
819 * Long-form addressing.
820 */
821/* ARGSUSED */
822void
823xfs_btree_reada_bufl(
824 xfs_mount_t *mp, /* file system mount point */
825 xfs_fsblock_t fsbno, /* file system block number */
826 xfs_extlen_t count) /* count of filesystem blocks */
827{
828 xfs_daddr_t d;
829
830 ASSERT(fsbno != NULLFSBLOCK);
831 d = XFS_FSB_TO_DADDR(mp, fsbno);
832 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
833}
834
835/*
836 * Read-ahead the block, don't wait for it, don't return a buffer.
837 * Short-form addressing.
838 */
839/* ARGSUSED */
840void
841xfs_btree_reada_bufs(
842 xfs_mount_t *mp, /* file system mount point */
843 xfs_agnumber_t agno, /* allocation group number */
844 xfs_agblock_t agbno, /* allocation group block number */
845 xfs_extlen_t count) /* count of filesystem blocks */
846{
847 xfs_daddr_t d;
848
849 ASSERT(agno != NULLAGNUMBER);
850 ASSERT(agbno != NULLAGBLOCK);
851 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
852 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
853}
854
855/*
856 * Read-ahead btree blocks, at the given level.
857 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
858 */
859int
860xfs_btree_readahead_core(
861 xfs_btree_cur_t *cur, /* btree cursor */
862 int lev, /* level in btree */
863 int lr) /* left/right bits */
864{
865 xfs_alloc_block_t *a;
866 xfs_bmbt_block_t *b;
867 xfs_inobt_block_t *i;
868 int rval = 0;
869
870 ASSERT(cur->bc_bufs[lev] != NULL);
871 cur->bc_ra[lev] |= lr;
872 switch (cur->bc_btnum) {
873 case XFS_BTNUM_BNO:
874 case XFS_BTNUM_CNT:
875 a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
876 if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(a->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
877 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
878 INT_GET(a->bb_leftsib, ARCH_CONVERT), 1);
879 rval++;
880 }
881 if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(a->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
882 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
883 INT_GET(a->bb_rightsib, ARCH_CONVERT), 1);
884 rval++;
885 }
886 break;
887 case XFS_BTNUM_BMAP:
888 b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
889 if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(b->bb_leftsib, ARCH_CONVERT) != NULLDFSBNO) {
890 xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_leftsib, ARCH_CONVERT), 1);
891 rval++;
892 }
893 if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(b->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
894 xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_rightsib, ARCH_CONVERT), 1);
895 rval++;
896 }
897 break;
898 case XFS_BTNUM_INO:
899 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
900 if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(i->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
901 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
902 INT_GET(i->bb_leftsib, ARCH_CONVERT), 1);
903 rval++;
904 }
905 if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(i->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
906 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
907 INT_GET(i->bb_rightsib, ARCH_CONVERT), 1);
908 rval++;
909 }
910 break;
911 default:
912 ASSERT(0);
913 }
914 return rval;
915}
916
917/*
918 * Set the buffer for level "lev" in the cursor to bp, releasing
919 * any previous buffer.
920 */
921void
922xfs_btree_setbuf(
923 xfs_btree_cur_t *cur, /* btree cursor */
924 int lev, /* level in btree */
925 xfs_buf_t *bp) /* new buffer to set */
926{
927 xfs_btree_block_t *b; /* btree block */
928 xfs_buf_t *obp; /* old buffer pointer */
929
930 obp = cur->bc_bufs[lev];
931 if (obp)
932 xfs_trans_brelse(cur->bc_tp, obp);
933 cur->bc_bufs[lev] = bp;
934 cur->bc_ra[lev] = 0;
935 if (!bp)
936 return;
937 b = XFS_BUF_TO_BLOCK(bp);
938 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
939 if (INT_GET(b->bb_u.l.bb_leftsib, ARCH_CONVERT) == NULLDFSBNO)
940 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
941 if (INT_GET(b->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO)
942 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
943 } else {
944 if (INT_GET(b->bb_u.s.bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK)
945 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
946 if (INT_GET(b->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK)
947 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
948 }
949}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
new file mode 100644
index 000000000000..93872bba41f5
--- /dev/null
+++ b/fs/xfs/xfs_btree.h
@@ -0,0 +1,592 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BTREE_H__
33#define __XFS_BTREE_H__
34
35struct xfs_buf;
36struct xfs_bmap_free;
37struct xfs_inode;
38struct xfs_mount;
39struct xfs_trans;
40
41/*
42 * This nonsense is to make -wlint happy.
43 */
44#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
45#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
46#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
47
48#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
49#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
50#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
51#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
52
53/*
54 * Short form header: space allocation btrees.
55 */
56typedef struct xfs_btree_sblock
57{
58 __uint32_t bb_magic; /* magic number for block type */
59 __uint16_t bb_level; /* 0 is a leaf */
60 __uint16_t bb_numrecs; /* current # of data records */
61 xfs_agblock_t bb_leftsib; /* left sibling block or NULLAGBLOCK */
62 xfs_agblock_t bb_rightsib; /* right sibling block or NULLAGBLOCK */
63} xfs_btree_sblock_t;
64
65/*
66 * Long form header: bmap btrees.
67 */
68typedef struct xfs_btree_lblock
69{
70 __uint32_t bb_magic; /* magic number for block type */
71 __uint16_t bb_level; /* 0 is a leaf */
72 __uint16_t bb_numrecs; /* current # of data records */
73 xfs_dfsbno_t bb_leftsib; /* left sibling block or NULLDFSBNO */
74 xfs_dfsbno_t bb_rightsib; /* right sibling block or NULLDFSBNO */
75} xfs_btree_lblock_t;
76
77/*
78 * Combined header and structure, used by common code.
79 */
80typedef struct xfs_btree_hdr
81{
82 __uint32_t bb_magic; /* magic number for block type */
83 __uint16_t bb_level; /* 0 is a leaf */
84 __uint16_t bb_numrecs; /* current # of data records */
85} xfs_btree_hdr_t;
86
87typedef struct xfs_btree_block
88{
89 xfs_btree_hdr_t bb_h; /* header */
90 union {
91 struct {
92 xfs_agblock_t bb_leftsib;
93 xfs_agblock_t bb_rightsib;
94 } s; /* short form pointers */
95 struct {
96 xfs_dfsbno_t bb_leftsib;
97 xfs_dfsbno_t bb_rightsib;
98 } l; /* long form pointers */
99 } bb_u; /* rest */
100} xfs_btree_block_t;
101
102/*
103 * For logging record fields.
104 */
105#define XFS_BB_MAGIC 0x01
106#define XFS_BB_LEVEL 0x02
107#define XFS_BB_NUMRECS 0x04
108#define XFS_BB_LEFTSIB 0x08
109#define XFS_BB_RIGHTSIB 0x10
110#define XFS_BB_NUM_BITS 5
111#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
112
113/*
114 * Boolean to select which form of xfs_btree_block_t.bb_u to use.
115 */
116#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BTREE_LONG_PTRS)
117int xfs_btree_long_ptrs(xfs_btnum_t btnum);
118#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
119#else
120#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
121#endif
122
123/*
124 * Magic numbers for btree blocks.
125 */
126extern const __uint32_t xfs_magics[];
127
128/*
129 * Maximum and minimum records in a btree block.
130 * Given block size, type prefix, and leaf flag (0 or 1).
131 * The divisor below is equivalent to lf ? (e1) : (e2) but that produces
132 * compiler warnings.
133 */
134#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \
135 ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
136 (((lf) * (uint)sizeof(t ## _rec_t)) + \
137 ((1 - (lf)) * \
138 ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
139#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \
140 (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
141
142/*
143 * Record, key, and pointer address calculation macros.
144 * Given block size, type prefix, block pointer, and index of requested entry
145 * (first entry numbered 1).
146 */
147#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \
148 ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
149 ((i) - 1) * sizeof(t ## _rec_t)))
150#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \
151 ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
152 ((i) - 1) * sizeof(t ## _key_t)))
153#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \
154 ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
155 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
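/*
 * Editor's worked example (not part of this header), assuming a
 * 4096-byte block and typical allocation btree sizes (16-byte
 * short-form block header, 8-byte records and keys, 4-byte ptrs):
 *
 *	leaf:	  XFS_BTREE_BLOCK_MAXRECS(4096, xfs_alloc, 1)
 *		    = (4096 - 16) / 8       = 510 records
 *	non-leaf: XFS_BTREE_BLOCK_MAXRECS(4096, xfs_alloc, 0)
 *		    = (4096 - 16) / (8 + 4) = 340 key/ptr pairs
 *
 * XFS_BTREE_PTR_ADDR reflects the non-leaf layout, with all mxr keys
 * packed ahead of the pointers:
 *
 *	header | key[1..mxr] | ptr[1..mxr]	(entries start at 1)
 */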
156
157#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
158
159/*
160 * Btree cursor structure.
161 * This collects all information needed by the btree code in one place.
162 */
163typedef struct xfs_btree_cur
164{
165 struct xfs_trans *bc_tp; /* transaction we're in, if any */
166 struct xfs_mount *bc_mp; /* file system mount struct */
167 union {
168 xfs_alloc_rec_t a;
169 xfs_bmbt_irec_t b;
170 xfs_inobt_rec_t i;
171 } bc_rec; /* current insert/search record value */
172 struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
173 int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
174 __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
175#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
176#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
177 __uint8_t bc_nlevels; /* number of levels in the tree */
178 __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
179 xfs_btnum_t bc_btnum; /* identifies which btree type */
180 union {
181 struct { /* needed for BNO, CNT */
182 struct xfs_buf *agbp; /* agf buffer pointer */
183 xfs_agnumber_t agno; /* ag number */
184 } a;
185 struct { /* needed for BMAP */
186 struct xfs_inode *ip; /* pointer to our inode */
187 struct xfs_bmap_free *flist; /* list to free after */
188 xfs_fsblock_t firstblock; /* 1st blk allocated */
189 int allocated; /* count of alloced */
190 short forksize; /* fork's inode space */
191 char whichfork; /* data or attr fork */
192 char flags; /* flags */
193#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
194 } b;
195 struct { /* needed for INO */
196 struct xfs_buf *agbp; /* agi buffer pointer */
197 xfs_agnumber_t agno; /* ag number */
198 } i;
199 } bc_private; /* per-btree type data */
200} xfs_btree_cur_t;
201
202#define XFS_BTREE_NOERROR 0
203#define XFS_BTREE_ERROR 1
204
205/*
206 * Convert from buffer to btree block header.
207 */
208#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BLOCK)
209xfs_btree_block_t *xfs_buf_to_block(struct xfs_buf *bp);
210#define XFS_BUF_TO_BLOCK(bp) xfs_buf_to_block(bp)
211#else
212#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)(XFS_BUF_PTR(bp)))
213#endif
214#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_LBLOCK)
215xfs_btree_lblock_t *xfs_buf_to_lblock(struct xfs_buf *bp);
216#define XFS_BUF_TO_LBLOCK(bp) xfs_buf_to_lblock(bp)
217#else
218#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)(XFS_BUF_PTR(bp)))
219#endif
220#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBLOCK)
221xfs_btree_sblock_t *xfs_buf_to_sblock(struct xfs_buf *bp);
222#define XFS_BUF_TO_SBLOCK(bp) xfs_buf_to_sblock(bp)
223#else
224#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)(XFS_BUF_PTR(bp)))
225#endif
226
227#ifdef __KERNEL__
228
229#ifdef DEBUG
230/*
231 * Debug routine: check that block header is ok.
232 */
233void
234xfs_btree_check_block(
235 xfs_btree_cur_t *cur, /* btree cursor */
236 xfs_btree_block_t *block, /* generic btree block pointer */
237 int level, /* level of the btree block */
238 struct xfs_buf *bp); /* buffer containing block, if any */
239
240/*
241 * Debug routine: check that keys are in the right order.
242 */
243void
244xfs_btree_check_key(
245 xfs_btnum_t btnum, /* btree identifier */
246 void *ak1, /* pointer to left (lower) key */
247 void *ak2); /* pointer to right (higher) key */
248
249/*
250 * Debug routine: check that records are in the right order.
251 */
252void
253xfs_btree_check_rec(
254 xfs_btnum_t btnum, /* btree identifier */
255 void *ar1, /* pointer to left (lower) record */
256 void *ar2); /* pointer to right (higher) record */
257#else
258#define xfs_btree_check_block(a,b,c,d)
259#define xfs_btree_check_key(a,b,c)
260#define xfs_btree_check_rec(a,b,c)
261#endif /* DEBUG */
262
263/*
264 * Checking routine: check that long form block header is ok.
265 */
266int /* error (0 or EFSCORRUPTED) */
267xfs_btree_check_lblock(
268 xfs_btree_cur_t *cur, /* btree cursor */
269 xfs_btree_lblock_t *block, /* btree long form block pointer */
270 int level, /* level of the btree block */
271 struct xfs_buf *bp); /* buffer containing block, if any */
272
273/*
274 * Checking routine: check that (long) pointer is ok.
275 */
276int /* error (0 or EFSCORRUPTED) */
277xfs_btree_check_lptr(
278 xfs_btree_cur_t *cur, /* btree cursor */
279 xfs_dfsbno_t ptr, /* btree block disk address */
280 int level); /* btree block level */
281
282/*
283 * Checking routine: check that short form block header is ok.
284 */
285int /* error (0 or EFSCORRUPTED) */
286xfs_btree_check_sblock(
287 xfs_btree_cur_t *cur, /* btree cursor */
288 xfs_btree_sblock_t *block, /* btree short form block pointer */
289 int level, /* level of the btree block */
290 struct xfs_buf *bp); /* buffer containing block */
291
292/*
293 * Checking routine: check that (short) pointer is ok.
294 */
295int /* error (0 or EFSCORRUPTED) */
296xfs_btree_check_sptr(
297 xfs_btree_cur_t *cur, /* btree cursor */
298 xfs_agblock_t ptr, /* btree block disk address */
299 int level); /* btree block level */
300
301/*
302 * Delete the btree cursor.
303 */
304void
305xfs_btree_del_cursor(
306 xfs_btree_cur_t *cur, /* btree cursor */
307 int error); /* del because of error */
308
309/*
310 * Duplicate the btree cursor.
311 * Allocate a new one, copy the record, re-get the buffers.
312 */
313int /* error */
314xfs_btree_dup_cursor(
315 xfs_btree_cur_t *cur, /* input cursor */
316 xfs_btree_cur_t **ncur);/* output cursor */
317
318/*
319 * Change the cursor to point to the first record in the current block
320 * at the given level. Other levels are unaffected.
321 */
322int /* success=1, failure=0 */
323xfs_btree_firstrec(
324 xfs_btree_cur_t *cur, /* btree cursor */
325 int level); /* level to change */
326
327/*
328 * Retrieve the block pointer from the cursor at the given level.
329 * This may be a bmap btree root or from a buffer.
330 */
331xfs_btree_block_t * /* generic btree block pointer */
332xfs_btree_get_block(
333 xfs_btree_cur_t *cur, /* btree cursor */
334 int level, /* level in btree */
335 struct xfs_buf **bpp); /* buffer containing the block */
336
337/*
338 * Get a buffer for the block, return it with no data read.
339 * Long-form addressing.
340 */
341struct xfs_buf * /* buffer for fsbno */
342xfs_btree_get_bufl(
343 struct xfs_mount *mp, /* file system mount point */
344 struct xfs_trans *tp, /* transaction pointer */
345 xfs_fsblock_t fsbno, /* file system block number */
346 uint lock); /* lock flags for get_buf */
347
348/*
349 * Get a buffer for the block, return it with no data read.
350 * Short-form addressing.
351 */
352struct xfs_buf * /* buffer for agno/agbno */
353xfs_btree_get_bufs(
354 struct xfs_mount *mp, /* file system mount point */
355 struct xfs_trans *tp, /* transaction pointer */
356 xfs_agnumber_t agno, /* allocation group number */
357 xfs_agblock_t agbno, /* allocation group block number */
358 uint lock); /* lock flags for get_buf */
359
360/*
361 * Allocate a new btree cursor.
362 * The cursor is either for allocation (A) or bmap (B) or inodes (I).
363 */
364xfs_btree_cur_t * /* new btree cursor */
365xfs_btree_init_cursor(
366 struct xfs_mount *mp, /* file system mount point */
367 struct xfs_trans *tp, /* transaction pointer */
368 struct xfs_buf *agbp, /* (A only) agf buffer; (I only) agi buffer */
369 xfs_agnumber_t agno, /* (AI only) allocation group number */
370 xfs_btnum_t btnum, /* btree identifier */
371 struct xfs_inode *ip, /* (B only) inode owning the btree */
372 int whichfork); /* (B only) data/attr fork */
373
374/*
375 * Check for the cursor referring to the last block at the given level.
376 */
377int /* 1=is last block, 0=not last block */
378xfs_btree_islastblock(
379 xfs_btree_cur_t *cur, /* btree cursor */
380 int level); /* level to check */
381
382/*
383 * Change the cursor to point to the last record in the current block
384 * at the given level. Other levels are unaffected.
385 */
386int /* success=1, failure=0 */
387xfs_btree_lastrec(
388 xfs_btree_cur_t *cur, /* btree cursor */
389 int level); /* level to change */
390
391/*
392 * Compute first and last byte offsets for the fields given.
393 * Interprets the offsets table, which contains struct field offsets.
394 */
395void
396xfs_btree_offsets(
397 __int64_t fields, /* bitmask of fields */
398 const short *offsets,/* table of field offsets */
399 int nbits, /* number of bits to inspect */
400 int *first, /* output: first byte offset */
401 int *last); /* output: last byte offset */
402
403/*
404 * Get a buffer for the block, return it read in.
405 * Long-form addressing.
406 */
407int /* error */
408xfs_btree_read_bufl(
409 struct xfs_mount *mp, /* file system mount point */
410 struct xfs_trans *tp, /* transaction pointer */
411 xfs_fsblock_t fsbno, /* file system block number */
412 uint lock, /* lock flags for read_buf */
413 struct xfs_buf **bpp, /* buffer for fsbno */
414 int refval);/* ref count value for buffer */
415
416/*
417 * Get a buffer for the block, return it read in.
418 * Short-form addressing.
419 */
420int /* error */
421xfs_btree_read_bufs(
422 struct xfs_mount *mp, /* file system mount point */
423 struct xfs_trans *tp, /* transaction pointer */
424 xfs_agnumber_t agno, /* allocation group number */
425 xfs_agblock_t agbno, /* allocation group block number */
426 uint lock, /* lock flags for read_buf */
427 struct xfs_buf **bpp, /* buffer for agno/agbno */
428 int refval);/* ref count value for buffer */
429
430/*
431 * Read-ahead the block, don't wait for it, don't return a buffer.
432 * Long-form addressing.
433 */
434void
435xfs_btree_reada_bufl(
436 struct xfs_mount *mp, /* file system mount point */
437 xfs_fsblock_t fsbno, /* file system block number */
438 xfs_extlen_t count); /* count of filesystem blocks */
439
440/*
441 * Read-ahead the block, don't wait for it, don't return a buffer.
442 * Short-form addressing.
443 */
444void
445xfs_btree_reada_bufs(
446 struct xfs_mount *mp, /* file system mount point */
447 xfs_agnumber_t agno, /* allocation group number */
448 xfs_agblock_t agbno, /* allocation group block number */
449 xfs_extlen_t count); /* count of filesystem blocks */
450
451/*
452 * Read-ahead btree blocks, at the given level.
453 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
454 */
455int /* readahead block count */
456xfs_btree_readahead_core(
457 xfs_btree_cur_t *cur, /* btree cursor */
458 int lev, /* level in btree */
459 int lr); /* left/right bits */
460
461static inline int /* readahead block count */
462xfs_btree_readahead(
463 xfs_btree_cur_t *cur, /* btree cursor */
464 int lev, /* level in btree */
465 int lr) /* left/right bits */
466{
467 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
468 return 0;
469
470 return xfs_btree_readahead_core(cur, lev, lr);
471}
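/*
 * Illustrative sketch (editor's example, not part of this header):
 * cursor movement code typically requests read-ahead of both siblings
 * before walking sideways; once bc_ra[lev] has the bits set, the
 * inline wrapper turns repeat calls into cheap no-ops.
 */
static inline void
example_readahead_both(
	xfs_btree_cur_t	*cur,	/* btree cursor */
	int		lev)	/* level in btree */
{
	xfs_btree_readahead(cur, lev,
		XFS_BTCUR_LEFTRA | XFS_BTCUR_RIGHTRA);
}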
472
473
474/*
475 * Set the buffer for level "lev" in the cursor to bp, releasing
476 * any previous buffer.
477 */
478void
479xfs_btree_setbuf(
480 xfs_btree_cur_t *cur, /* btree cursor */
481 int lev, /* level in btree */
482 struct xfs_buf *bp); /* new buffer to set */
483
484#endif /* __KERNEL__ */
485
486
487/*
488 * Min and max functions for extlen, agblock, fileoff, and filblks types.
489 */
490#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MIN)
491xfs_extlen_t xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b);
492#define XFS_EXTLEN_MIN(a,b) xfs_extlen_min(a,b)
493#else
494#define XFS_EXTLEN_MIN(a,b) \
495 ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \
496 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
497#endif
498#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MAX)
499xfs_extlen_t xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b);
500#define XFS_EXTLEN_MAX(a,b) xfs_extlen_max(a,b)
501#else
502#define XFS_EXTLEN_MAX(a,b) \
503 ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \
504 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
505#endif
506
507#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MIN)
508xfs_agblock_t xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b);
509#define XFS_AGBLOCK_MIN(a,b) xfs_agblock_min(a,b)
510#else
511#define XFS_AGBLOCK_MIN(a,b) \
512 ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \
513 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
514#endif
515#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MAX)
516xfs_agblock_t xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b);
517#define XFS_AGBLOCK_MAX(a,b) xfs_agblock_max(a,b)
518#else
519#define XFS_AGBLOCK_MAX(a,b) \
520 ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \
521 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
522#endif
523
524#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MIN)
525xfs_fileoff_t xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b);
526#define XFS_FILEOFF_MIN(a,b) xfs_fileoff_min(a,b)
527#else
528#define XFS_FILEOFF_MIN(a,b) \
529 ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \
530 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
531#endif
532#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MAX)
533xfs_fileoff_t xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b);
534#define XFS_FILEOFF_MAX(a,b) xfs_fileoff_max(a,b)
535#else
536#define XFS_FILEOFF_MAX(a,b) \
537 ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \
538 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
539#endif
540
541#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MIN)
542xfs_filblks_t xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b);
543#define XFS_FILBLKS_MIN(a,b) xfs_filblks_min(a,b)
544#else
545#define XFS_FILBLKS_MIN(a,b) \
546 ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \
547 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
548#endif
549#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MAX)
550xfs_filblks_t xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b);
551#define XFS_FILBLKS_MAX(a,b) xfs_filblks_max(a,b)
552#else
553#define XFS_FILBLKS_MAX(a,b) \
554 ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \
555 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
556#endif
557#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_SANITY_CHECK)
558int xfs_fsb_sanity_check(struct xfs_mount *mp, xfs_fsblock_t fsb);
559#define XFS_FSB_SANITY_CHECK(mp,fsb) xfs_fsb_sanity_check(mp,fsb)
560#else
561#define XFS_FSB_SANITY_CHECK(mp,fsb) \
562 (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
563 XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
564#endif
565
566/*
567 * Macros to set EFSCORRUPTED & return/branch.
568 */
569#define XFS_WANT_CORRUPTED_GOTO(x,l) \
570 { \
571 int fs_is_ok = (x); \
572 ASSERT(fs_is_ok); \
573 if (unlikely(!fs_is_ok)) { \
574 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
575 XFS_ERRLEVEL_LOW, NULL); \
576 error = XFS_ERROR(EFSCORRUPTED); \
577 goto l; \
578 } \
579 }
580
581#define XFS_WANT_CORRUPTED_RETURN(x) \
582 { \
583 int fs_is_ok = (x); \
584 ASSERT(fs_is_ok); \
585 if (unlikely(!fs_is_ok)) { \
586 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
587 XFS_ERRLEVEL_LOW, NULL); \
588 return XFS_ERROR(EFSCORRUPTED); \
589 } \
590 }
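/*
 * Editor's note (not part of this header): xfs_btree_check_lptr() and
 * xfs_btree_check_sptr() in xfs_btree.c are canonical users of
 * XFS_WANT_CORRUPTED_RETURN -- the predicate is ASSERTed in DEBUG
 * builds and turned into an EFSCORRUPTED return otherwise.  The GOTO
 * form expects an `error' variable and a label in the caller, e.g.:
 *
 *	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 */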
591
592#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
new file mode 100644
index 000000000000..9ab0039f07df
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.c
@@ -0,0 +1,1221 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains the implementation of the xfs_buf_log_item.
35 * It contains the item operations used to manipulate the buf log
36 * items as well as utility routines used by the buffer specific
37 * transaction routines.
38 */
39
40#include "xfs.h"
41
42#include "xfs_macros.h"
43#include "xfs_types.h"
44#include "xfs_inum.h"
45#include "xfs_log.h"
46#include "xfs_trans.h"
47#include "xfs_buf_item.h"
48#include "xfs_sb.h"
49#include "xfs_dir.h"
50#include "xfs_dmapi.h"
51#include "xfs_mount.h"
52#include "xfs_trans_priv.h"
53#include "xfs_rw.h"
54#include "xfs_bit.h"
55#include "xfs_error.h"
56
57
58kmem_zone_t *xfs_buf_item_zone;
59
60#ifdef XFS_TRANS_DEBUG
61/*
62 * This function uses an alternate strategy for tracking the bytes
63 * that the user requests to be logged. This can then be used
64 * in conjunction with the bli_orig array in the buf log item to
65 * catch bugs in our callers' code.
66 *
67 * We also double check the bits set in xfs_buf_item_log using a
68 * simple algorithm to check that every byte is accounted for.
69 */
70STATIC void
71xfs_buf_item_log_debug(
72 xfs_buf_log_item_t *bip,
73 uint first,
74 uint last)
75{
76 uint x;
77 uint byte;
78 uint nbytes;
79 uint chunk_num;
80 uint word_num;
81 uint bit_num;
82 uint bit_set;
83 uint *wordp;
84
85 ASSERT(bip->bli_logged != NULL);
86 byte = first;
87 nbytes = last - first + 1;
88 bfset(bip->bli_logged, first, nbytes);
89 for (x = 0; x < nbytes; x++) {
90 chunk_num = byte >> XFS_BLI_SHIFT;
91 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
92 bit_num = chunk_num & (NBWORD - 1);
93 wordp = &(bip->bli_format.blf_data_map[word_num]);
94 bit_set = *wordp & (1 << bit_num);
95 ASSERT(bit_set);
96 byte++;
97 }
98}
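/*
 * Editor's worked example (not part of this file), assuming the usual
 * 128-byte chunks (XFS_BLI_SHIFT == 7) and 32-bit map words
 * (BIT_TO_WORD_SHIFT == 5, NBWORD == 32): byte 5000 falls in chunk
 * 5000 >> 7 == 39, which is tracked by bit 39 & 31 == 7 of word
 * 39 >> 5 == 1 in blf_data_map[].
 */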
99
100/*
101 * This function is called when we flush something into a buffer without
102 * logging it. This happens for things like inodes which are logged
103 * separately from the buffer.
104 */
105void
106xfs_buf_item_flush_log_debug(
107 xfs_buf_t *bp,
108 uint first,
109 uint last)
110{
111 xfs_buf_log_item_t *bip;
112 uint nbytes;
113
114 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
115 if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
116 return;
117 }
118
119 ASSERT(bip->bli_logged != NULL);
120 nbytes = last - first + 1;
121 bfset(bip->bli_logged, first, nbytes);
122}
123
124/*
125 * This function is called to verify that our callers have logged
126 * all the bytes that they changed.
127 *
128 * It does this by comparing the original copy of the buffer stored in
129 * the buf log item's bli_orig array to the current copy of the buffer
130 * and ensuring that all bytes which miscompare are set in the bli_logged
131 * array of the buf log item.
132 */
133STATIC void
134xfs_buf_item_log_check(
135 xfs_buf_log_item_t *bip)
136{
137 char *orig;
138 char *buffer;
139 int x;
140 xfs_buf_t *bp;
141
142 ASSERT(bip->bli_orig != NULL);
143 ASSERT(bip->bli_logged != NULL);
144
145 bp = bip->bli_buf;
146 ASSERT(XFS_BUF_COUNT(bp) > 0);
147 ASSERT(XFS_BUF_PTR(bp) != NULL);
148 orig = bip->bli_orig;
149 buffer = XFS_BUF_PTR(bp);
150 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
151 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
152 cmn_err(CE_PANIC,
153 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
154 bip, bp, orig, x);
155 }
156}
157#else
158#define xfs_buf_item_log_debug(x,y,z)
159#define xfs_buf_item_log_check(x)
160#endif
161
162STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
163STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
164
165/*
166 * This returns the number of log iovecs needed to log the
167 * given buf log item.
168 *
169 * It calculates this as 1 iovec for the buf log format structure
170 * and 1 for each stretch of non-contiguous chunks to be logged.
171 * Contiguous chunks are logged in a single iovec.
172 *
173 * If the XFS_BLI_STALE flag has been set, then log nothing.
174 */
175uint
176xfs_buf_item_size(
177 xfs_buf_log_item_t *bip)
178{
179 uint nvecs;
180 int next_bit;
181 int last_bit;
182 xfs_buf_t *bp;
183
184 ASSERT(atomic_read(&bip->bli_refcount) > 0);
185 if (bip->bli_flags & XFS_BLI_STALE) {
186 /*
187 * The buffer is stale, so all we need to log
188 * is the buf log format structure with the
189 * cancel flag in it.
190 */
191 xfs_buf_item_trace("SIZE STALE", bip);
192 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
193 return 1;
194 }
195
196 bp = bip->bli_buf;
197 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
198 nvecs = 1;
199 last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
200 bip->bli_format.blf_map_size, 0);
201 ASSERT(last_bit != -1);
202 nvecs++;
203 while (last_bit != -1) {
204 /*
205 * This takes the bit number to start looking from and
206 * returns the next set bit from there. It returns -1
207 * if there are no more bits set or the start bit is
208 * beyond the end of the bitmap.
209 */
210 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
211 bip->bli_format.blf_map_size,
212 last_bit + 1);
213 /*
214 * If we run out of bits, leave the loop,
215 * else if we find a new set of bits bump the number of vecs,
216 * else keep scanning the current set of bits.
217 */
218 if (next_bit == -1) {
219 last_bit = -1;
220 } else if (next_bit != last_bit + 1) {
221 last_bit = next_bit;
222 nvecs++;
223 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
224 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
225 XFS_BLI_CHUNK)) {
226 last_bit = next_bit;
227 nvecs++;
228 } else {
229 last_bit++;
230 }
231 }
232
233 xfs_buf_item_trace("SIZE NORM", bip);
234 return nvecs;
235}
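/*
 * Editor's worked example (not part of this file): with a data map of
 * 0x73 (binary 1110011, i.e. chunks 0-1 and 4-6 dirty), the scan
 * above counts 1 iovec for the format structure plus 1 per contiguous
 * run, giving nvecs == 3 -- provided each run is also contiguous in
 * the buffer's virtual mapping, which the xfs_buf_offset() comparison
 * checks for discontiguously mapped pages.
 */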
236
237/*
238 * This is called to fill in the vector of log iovecs for the
239 * given log buf item. It fills the first entry with a buf log
240 * format structure, and the rest point to contiguous chunks
241 * within the buffer.
242 */
243void
244xfs_buf_item_format(
245 xfs_buf_log_item_t *bip,
246 xfs_log_iovec_t *log_vector)
247{
248 uint base_size;
249 uint nvecs;
250 xfs_log_iovec_t *vecp;
251 xfs_buf_t *bp;
252 int first_bit;
253 int last_bit;
254 int next_bit;
255 uint nbits;
256 uint buffer_offset;
257
258 ASSERT(atomic_read(&bip->bli_refcount) > 0);
259 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
260 (bip->bli_flags & XFS_BLI_STALE));
261 bp = bip->bli_buf;
262 ASSERT(XFS_BUF_BP_ISMAPPED(bp));
263 vecp = log_vector;
264
265 /*
266 * The size of the base structure is the size of the
267 * declared structure plus the space for the extra words
268 * of the bitmap. We subtract one from the map size, because
269 * the first element of the bitmap is accounted for in the
270 * size of the base structure.
271 */
272 base_size =
273 (uint)(sizeof(xfs_buf_log_format_t) +
274 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
275 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
276 vecp->i_len = base_size;
277 vecp++;
278 nvecs = 1;
279
280 if (bip->bli_flags & XFS_BLI_STALE) {
281 /*
282 * The buffer is stale, so all we need to log
283 * is the buf log format structure with the
284 * cancel flag in it.
285 */
286 xfs_buf_item_trace("FORMAT STALE", bip);
287 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
288 bip->bli_format.blf_size = nvecs;
289 return;
290 }
291
292 /*
293 * Fill in an iovec for each set of contiguous chunks.
294 */
295 first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
296 bip->bli_format.blf_map_size, 0);
297 ASSERT(first_bit != -1);
298 last_bit = first_bit;
299 nbits = 1;
300 for (;;) {
301 /*
302 * This takes the bit number to start looking from and
303 * returns the next set bit from there. It returns -1
304 * if there are no more bits set or the start bit is
305 * beyond the end of the bitmap.
306 */
307 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
308 bip->bli_format.blf_map_size,
309 (uint)last_bit + 1);
310 /*
311 * If we run out of bits fill in the last iovec and get
312 * out of the loop.
313 * Else if we start a new set of bits then fill in the
314 * iovec for the series we were looking at and start
315 * counting the bits in the new one.
316 * Else we're still in the same set of bits so just
317 * keep counting and scanning.
318 */
319 if (next_bit == -1) {
320 buffer_offset = first_bit * XFS_BLI_CHUNK;
321 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
322 vecp->i_len = nbits * XFS_BLI_CHUNK;
323 nvecs++;
324 break;
325 } else if (next_bit != last_bit + 1) {
326 buffer_offset = first_bit * XFS_BLI_CHUNK;
327 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
328 vecp->i_len = nbits * XFS_BLI_CHUNK;
329 nvecs++;
330 vecp++;
331 first_bit = next_bit;
332 last_bit = next_bit;
333 nbits = 1;
334 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
335 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
336 XFS_BLI_CHUNK)) {
337 buffer_offset = first_bit * XFS_BLI_CHUNK;
338 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
339 vecp->i_len = nbits * XFS_BLI_CHUNK;
340/* You would think we need to bump nvecs here too, but we do not:
341 * this number is used by recovery, and it gets confused by the
342 * boundary split here, so we intentionally skip the following.
343 * nvecs++;
344 */
345 vecp++;
346 first_bit = next_bit;
347 last_bit = next_bit;
348 nbits = 1;
349 } else {
350 last_bit++;
351 nbits++;
352 }
353 }
354 bip->bli_format.blf_size = nvecs;
355
356 /*
357 * Check to make sure everything is consistent.
358 */
359 xfs_buf_item_trace("FORMAT NORM", bip);
360 xfs_buf_item_log_check(bip);
361}
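/*
 * Worked example (hypothetical map size): for blf_map_size == 2,
 * the base_size computed above is sizeof(xfs_buf_log_format_t) +
 * (2 - 1) * sizeof(uint), because blf_data_map[1] in the declared
 * structure already accounts for the first bitmap word.
 */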
362
363/*
364 * This is called to pin the buffer associated with the buf log
365 * item in memory so it cannot be written out. Simply call bpin()
366 * on the buffer to do this.
367 */
368void
369xfs_buf_item_pin(
370 xfs_buf_log_item_t *bip)
371{
372 xfs_buf_t *bp;
373
374 bp = bip->bli_buf;
375 ASSERT(XFS_BUF_ISBUSY(bp));
376 ASSERT(atomic_read(&bip->bli_refcount) > 0);
377 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
378 (bip->bli_flags & XFS_BLI_STALE));
379 xfs_buf_item_trace("PIN", bip);
380 xfs_buftrace("XFS_PIN", bp);
381 xfs_bpin(bp);
382}
383
384
385/*
386 * This is called to unpin the buffer associated with the buf log
387 * item which was previously pinned with a call to xfs_buf_item_pin().
388 * Just call bunpin() on the buffer to do this.
389 *
390 * Also drop the reference to the buf item for the current transaction.
391 * If the XFS_BLI_STALE flag is set and we are the last reference,
392 * then free up the buf log item and unlock the buffer.
393 */
394void
395xfs_buf_item_unpin(
396 xfs_buf_log_item_t *bip,
397 int stale)
398{
399 xfs_mount_t *mp;
400 xfs_buf_t *bp;
401 int freed;
402 SPLDECL(s);
403
404 bp = bip->bli_buf;
405 ASSERT(bp != NULL);
406 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
407 ASSERT(atomic_read(&bip->bli_refcount) > 0);
408 xfs_buf_item_trace("UNPIN", bip);
409 xfs_buftrace("XFS_UNPIN", bp);
410
411 freed = atomic_dec_and_test(&bip->bli_refcount);
412 mp = bip->bli_item.li_mountp;
413 xfs_bunpin(bp);
414 if (freed && stale) {
415 ASSERT(bip->bli_flags & XFS_BLI_STALE);
416 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
417 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
418 ASSERT(XFS_BUF_ISSTALE(bp));
419 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
420 xfs_buf_item_trace("UNPIN STALE", bip);
421 xfs_buftrace("XFS_UNPIN STALE", bp);
422 /*
423 * If we get called here because of an IO error, we may
424 * or may not have the item on the AIL. xfs_trans_delete_ail()
425 * will take care of that situation.
426 * xfs_trans_delete_ail() drops the AIL lock.
427 */
428 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
429 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
430 XFS_BUF_SET_FSPRIVATE(bp, NULL);
431 XFS_BUF_CLR_IODONE_FUNC(bp);
432 } else {
433 AIL_LOCK(mp,s);
434 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
435 xfs_buf_item_relse(bp);
436 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
437 }
438 xfs_buf_relse(bp);
439 }
440}
441
442/*
443 * This is called from uncommit in the forced-shutdown path.
444 * We need to check whether the reference count on the log item
445 * is going to drop to zero. If so, unpin will free the log item,
446 * so we need to free the item's descriptor (which points to the
447 * item) in the transaction.
448 */
449void
450xfs_buf_item_unpin_remove(
451 xfs_buf_log_item_t *bip,
452 xfs_trans_t *tp)
453{
454 xfs_buf_t *bp;
455 xfs_log_item_desc_t *lidp;
456 int stale = 0;
457
458 bp = bip->bli_buf;
459 /*
460 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
461 */
462 if ((atomic_read(&bip->bli_refcount) == 1) &&
463 (bip->bli_flags & XFS_BLI_STALE)) {
464 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
465 xfs_buf_item_trace("UNPIN REMOVE", bip);
466 xfs_buftrace("XFS_UNPIN_REMOVE", bp);
467 /*
468		 * Yes -- clear the transaction descriptor's in-use flag
469		 * and free the chunk if required. We can safely
470		 * do some work here and then call buf_item_unpin
471		 * to do the rest, because when the condition above
472		 * holds we hold the buffer locked, so no one else
473		 * will be able to bump up the refcount.
474 */
475 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
476 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
477 xfs_trans_free_item(tp, lidp);
478 /*
479 * Since the transaction no longer refers to the buffer,
480 * the buffer should no longer refer to the transaction.
481 */
482 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
483 }
484
485 xfs_buf_item_unpin(bip, stale);
486
487 return;
488}
489
490/*
491 * This is called to attempt to lock the buffer associated with this
492 * buf log item. Don't sleep on the buffer lock. If we can't get
493 * the lock right away, return 0. If we can get the lock, pull the
494 * buffer from the free list, mark it busy, and return 1.
495 */
496uint
497xfs_buf_item_trylock(
498 xfs_buf_log_item_t *bip)
499{
500 xfs_buf_t *bp;
501
502 bp = bip->bli_buf;
503
504 if (XFS_BUF_ISPINNED(bp)) {
505 return XFS_ITEM_PINNED;
506 }
507
508 if (!XFS_BUF_CPSEMA(bp)) {
509 return XFS_ITEM_LOCKED;
510 }
511
512 /*
513 * Remove the buffer from the free list. Only do this
514 * if it's on the free list. Private buffers like the
515 * superblock buffer are not.
516 */
517 XFS_BUF_HOLD(bp);
518
519 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
520 xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
521 return XFS_ITEM_SUCCESS;
522}
523
524/*
525 * Release the buffer associated with the buf log item.
526 * If there is no dirty logged data associated with the
527 * buffer recorded in the buf log item, then free the
528 * buf log item and remove the reference to it in the
529 * buffer.
530 *
531 * This call ignores the recursion count. It is only called
532 * when the buffer should REALLY be unlocked, regardless
533 * of the recursion count.
534 *
535 * If the XFS_BLI_HOLD flag is set in the buf log item, then
536 * free the log item if necessary but do not unlock the buffer.
537 * This is for support of xfs_trans_bhold(). Make sure the
538 * XFS_BLI_HOLD field is cleared if we don't free the item.
539 */
540void
541xfs_buf_item_unlock(
542 xfs_buf_log_item_t *bip)
543{
544 int aborted;
545 xfs_buf_t *bp;
546 uint hold;
547
548 bp = bip->bli_buf;
549 xfs_buftrace("XFS_UNLOCK", bp);
550
551 /*
552 * Clear the buffer's association with this transaction.
553 */
554 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
555
556 /*
557 * If this is a transaction abort, don't return early.
558 * Instead, allow the brelse to happen.
559 * Normally it would be done for stale (cancelled) buffers
560 * at unpin time, but we'll never go through the pin/unpin
561 * cycle if we abort inside commit.
562 */
563 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
564
565 /*
566 * If the buf item is marked stale, then don't do anything.
567 * We'll unlock the buffer and free the buf item when the
568 * buffer is unpinned for the last time.
569 */
570 if (bip->bli_flags & XFS_BLI_STALE) {
571 bip->bli_flags &= ~XFS_BLI_LOGGED;
572 xfs_buf_item_trace("UNLOCK STALE", bip);
573 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
574 if (!aborted)
575 return;
576 }
577
578 /*
579 * Drop the transaction's reference to the log item if
580 * it was not logged as part of the transaction. Otherwise
581 * we'll drop the reference in xfs_buf_item_unpin() when
582 * the transaction is really through with the buffer.
583 */
584 if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
585 atomic_dec(&bip->bli_refcount);
586 } else {
587 /*
588 * Clear the logged flag since this is per
589 * transaction state.
590 */
591 bip->bli_flags &= ~XFS_BLI_LOGGED;
592 }
593
594 /*
595 * Before possibly freeing the buf item, determine if we should
596 * release the buffer at the end of this routine.
597 */
598 hold = bip->bli_flags & XFS_BLI_HOLD;
599 xfs_buf_item_trace("UNLOCK", bip);
600
601 /*
602 * If the buf item isn't tracking any data, free it.
603 * Otherwise, if XFS_BLI_HOLD is set clear it.
604 */
605 if (xfs_count_bits(bip->bli_format.blf_data_map,
606 bip->bli_format.blf_map_size, 0) == 0) {
607 xfs_buf_item_relse(bp);
608 } else if (hold) {
609 bip->bli_flags &= ~XFS_BLI_HOLD;
610 }
611
612 /*
613 * Release the buffer if XFS_BLI_HOLD was not set.
614 */
615 if (!hold) {
616 xfs_buf_relse(bp);
617 }
618}
619
620/*
621 * This is called to find out where the oldest active copy of the
622 * buf log item in the on disk log resides now that the last log
623 * write of it completed at the given lsn.
624 * We always re-log all the dirty data in a buffer, so usually the
625 * latest copy in the on disk log is the only one that matters. For
626 * those cases we simply return the given lsn.
627 *
628 * The one exception to this is for buffers full of newly allocated
629 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
630 * flag set, indicating that only the di_next_unlinked fields from the
631 * inodes in the buffers will be replayed during recovery. If the
632 * original newly allocated inode images have not yet been flushed
633 * when the buffer is so relogged, then we need to make sure that we
634 * keep the old images in the 'active' portion of the log. We do this
635 * by returning the original lsn of that transaction here rather than
636 * the current one.
637 */
638xfs_lsn_t
639xfs_buf_item_committed(
640 xfs_buf_log_item_t *bip,
641 xfs_lsn_t lsn)
642{
643 xfs_buf_item_trace("COMMITTED", bip);
644 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
645 (bip->bli_item.li_lsn != 0)) {
646 return bip->bli_item.li_lsn;
647 }
648 return (lsn);
649}
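/*
 * Illustrative example (hypothetical LSN values): if an inode
 * allocation buffer was first committed at lsn 100 (li_lsn == 100)
 * and is relogged at lsn 200, this returns 100 so that the original
 * inode images remain in the active portion of the log.
 */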
650
651/*
652 * This is called when the transaction holding the buffer is aborted.
653 * Just behave as if the transaction had been cancelled. If we're shutting down
654 * and have aborted this transaction, we'll trap this buffer when it tries to
655 * get written out.
656 */
657void
658xfs_buf_item_abort(
659 xfs_buf_log_item_t *bip)
660{
661 xfs_buf_t *bp;
662
663 bp = bip->bli_buf;
664 xfs_buftrace("XFS_ABORT", bp);
665 XFS_BUF_SUPER_STALE(bp);
666 xfs_buf_item_unlock(bip);
667 return;
668}
669
670/*
671 * This is called to asynchronously write the buffer associated with this
672 * buf log item out to disk. The buffer will already have been locked by
673 * a successful call to xfs_buf_item_trylock(). If the buffer still has
674 * B_DELWRI set, then get it going out to disk with a call to bawrite().
675 * If not, then just release the buffer.
676 */
677void
678xfs_buf_item_push(
679 xfs_buf_log_item_t *bip)
680{
681 xfs_buf_t *bp;
682
683 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
684 xfs_buf_item_trace("PUSH", bip);
685
686 bp = bip->bli_buf;
687
688 if (XFS_BUF_ISDELAYWRITE(bp)) {
689 xfs_bawrite(bip->bli_item.li_mountp, bp);
690 } else {
691 xfs_buf_relse(bp);
692 }
693}
694
695/* ARGSUSED */
696void
697xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
698{
699}
700
701/*
702 * This is the ops vector shared by all buf log items.
703 */
704struct xfs_item_ops xfs_buf_item_ops = {
705 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
706 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
707 xfs_buf_item_format,
708 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
709 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
710 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
711 xfs_buf_item_unpin_remove,
712 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
713 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
714 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
715 xfs_buf_item_committed,
716 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
717 .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort,
718 .iop_pushbuf = NULL,
719 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
720 xfs_buf_item_committing
721};
722
723
724/*
725 * Allocate a new buf log item to go with the given buffer.
726 * Set the buffer's b_fsprivate field to point to the new
727 * buf log item. If there are other items attached to the
728 * buffer (see xfs_buf_attach_iodone() below), then put the
729 * buf log item at the front.
730 */
731void
732xfs_buf_item_init(
733 xfs_buf_t *bp,
734 xfs_mount_t *mp)
735{
736 xfs_log_item_t *lip;
737 xfs_buf_log_item_t *bip;
738 int chunks;
739 int map_size;
740
741 /*
742 * Check to see if there is already a buf log item for
743 * this buffer. If there is, it is guaranteed to be
744 * the first. If we do already have one, there is
745 * nothing to do here so return.
746 */
747 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
748 XFS_BUF_SET_FSPRIVATE3(bp, mp);
749 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
750 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
751 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
752 if (lip->li_type == XFS_LI_BUF) {
753 return;
754 }
755 }
756
757 /*
758 * chunks is the number of XFS_BLI_CHUNK size pieces
759 * the buffer can be divided into. Make sure not to
760 * truncate any pieces. map_size is the size of the
761 * bitmap needed to describe the chunks of the buffer.
762 */
763 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
764 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
765
766 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
767 KM_SLEEP);
768 bip->bli_item.li_type = XFS_LI_BUF;
769 bip->bli_item.li_ops = &xfs_buf_item_ops;
770 bip->bli_item.li_mountp = mp;
771 bip->bli_buf = bp;
772 bip->bli_format.blf_type = XFS_LI_BUF;
773 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
774 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
775 bip->bli_format.blf_map_size = map_size;
776#ifdef XFS_BLI_TRACE
777 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
778#endif
779
780#ifdef XFS_TRANS_DEBUG
781 /*
782 * Allocate the arrays for tracking what needs to be logged
783 * and what our callers request to be logged. bli_orig
784 * holds a copy of the original, clean buffer for comparison
785 * against, and bli_logged keeps a 1 bit flag per byte in
786 * the buffer to indicate which bytes the callers have asked
787 * to have logged.
788 */
789 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
790 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
791 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
792#endif
793
794 /*
795 * Put the buf item into the list of items attached to the
796 * buffer at the front.
797 */
798 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
799 bip->bli_item.li_bio_list =
800 XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
801 }
802 XFS_BUF_SET_FSPRIVATE(bp, bip);
803}
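/*
 * Worked example (hypothetical 4096 byte buffer): chunks =
 * (4096 + 127) >> 7 = 32 and map_size = (32 + NBWORD) >> 5 = 2
 * words; the "+ NBWORD" rounds up, so at most one extra bitmap
 * word is allocated.
 */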
804
805
806/*
807 * Mark bytes first through last inclusive as dirty in the buf
808 * item's bitmap.
809 */
810void
811xfs_buf_item_log(
812 xfs_buf_log_item_t *bip,
813 uint first,
814 uint last)
815{
816 uint first_bit;
817 uint last_bit;
818 uint bits_to_set;
819 uint bits_set;
820 uint word_num;
821 uint *wordp;
822 uint bit;
823 uint end_bit;
824 uint mask;
825
826 /*
827 * Mark the item as having some dirty data for
828 * quick reference in xfs_buf_item_dirty.
829 */
830 bip->bli_flags |= XFS_BLI_DIRTY;
831
832 /*
833 * Convert byte offsets to bit numbers.
834 */
835 first_bit = first >> XFS_BLI_SHIFT;
836 last_bit = last >> XFS_BLI_SHIFT;
837
838 /*
839 * Calculate the total number of bits to be set.
840 */
841 bits_to_set = last_bit - first_bit + 1;
842
843 /*
844 * Get a pointer to the first word in the bitmap
845 * to set a bit in.
846 */
847 word_num = first_bit >> BIT_TO_WORD_SHIFT;
848 wordp = &(bip->bli_format.blf_data_map[word_num]);
849
850 /*
851 * Calculate the starting bit in the first word.
852 */
853 bit = first_bit & (uint)(NBWORD - 1);
854
855 /*
856 * First set any bits in the first word of our range.
857 * If it starts at bit 0 of the word, it will be
858 * set below rather than here. That is what the variable
859 * bit tells us. The variable bits_set tracks the number
860 * of bits that have been set so far. End_bit is the number
861 * of the last bit to be set in this word plus one.
862 */
863 if (bit) {
864 end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
865 mask = ((1 << (end_bit - bit)) - 1) << bit;
866 *wordp |= mask;
867 wordp++;
868 bits_set = end_bit - bit;
869 } else {
870 bits_set = 0;
871 }
872
873 /*
874 * Now set bits a whole word at a time that are between
875 * first_bit and last_bit.
876 */
877 while ((bits_to_set - bits_set) >= NBWORD) {
878 *wordp |= 0xffffffff;
879 bits_set += NBWORD;
880 wordp++;
881 }
882
883 /*
884 * Finally, set any bits left to be set in one last partial word.
885 */
886 end_bit = bits_to_set - bits_set;
887 if (end_bit) {
888 mask = (1 << end_bit) - 1;
889 *wordp |= mask;
890 }
891
892 xfs_buf_item_log_debug(bip, first, last);
893}
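/*
 * Worked example (hypothetical byte range): logging bytes 640-1023
 * gives first_bit = 640 >> 7 = 5, last_bit = 1023 >> 7 = 7 and
 * bits_to_set = 3; then word_num = 0, bit = 5, end_bit = 8, and the
 * partial-word mask is ((1 << 3) - 1) << 5 = 0xe0, i.e. bits 5-7 of
 * the first bitmap word.
 */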
894
895
896/*
897 * Return 1 if the buffer has some data that has been logged (at any
898 * point, not just the current transaction) and 0 if not.
899 */
900uint
901xfs_buf_item_dirty(
902 xfs_buf_log_item_t *bip)
903{
904 return (bip->bli_flags & XFS_BLI_DIRTY);
905}
906
907/*
908 * This is called when the buf log item is no longer needed. It should
909 * free the buf log item associated with the given buffer and clear
910 * the buffer's pointer to the buf log item. If there are no more
911 * items in the list, clear the b_iodone field of the buffer (see
912 * xfs_buf_attach_iodone() below).
913 */
914void
915xfs_buf_item_relse(
916 xfs_buf_t *bp)
917{
918 xfs_buf_log_item_t *bip;
919
920 xfs_buftrace("XFS_RELSE", bp);
921 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
922 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
923 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
924 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
925 ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0);
926 XFS_BUF_CLR_IODONE_FUNC(bp);
927 }
928
929#ifdef XFS_TRANS_DEBUG
930 kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
931 bip->bli_orig = NULL;
932 kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
933 bip->bli_logged = NULL;
934#endif /* XFS_TRANS_DEBUG */
935
936#ifdef XFS_BLI_TRACE
937 ktrace_free(bip->bli_trace);
938#endif
939 kmem_zone_free(xfs_buf_item_zone, bip);
940}
941
942
943/*
944 * Add the given log item with its callback to the list of callbacks
945 * to be called when the buffer's I/O completes. If it is not set
946 * already, set the buffer's b_iodone() routine to be
947 * xfs_buf_iodone_callbacks() and link the log item into the list of
948 * items rooted at b_fsprivate. Items are always added as the second
949 * entry in the list if there is a first, because the buf item code
950 * assumes that the buf log item is first.
951 */
952void
953xfs_buf_attach_iodone(
954 xfs_buf_t *bp,
955 void (*cb)(xfs_buf_t *, xfs_log_item_t *),
956 xfs_log_item_t *lip)
957{
958 xfs_log_item_t *head_lip;
959
960 ASSERT(XFS_BUF_ISBUSY(bp));
961 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
962
963 lip->li_cb = cb;
964 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
965 head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
966 lip->li_bio_list = head_lip->li_bio_list;
967 head_lip->li_bio_list = lip;
968 } else {
969 XFS_BUF_SET_FSPRIVATE(bp, lip);
970 }
971
972 ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
973 (XFS_BUF_IODONE_FUNC(bp) == NULL));
974 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
975}
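/*
 * Illustrative list layout (hypothetical second item): attaching,
 * say, an inode log item to a buffer that already carries a buf log
 * item leaves b_fsprivate -> buf item -> inode item -> older items,
 * with b_iodone set to xfs_buf_iodone_callbacks for the whole chain.
 */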
976
977STATIC void
978xfs_buf_do_callbacks(
979 xfs_buf_t *bp,
980 xfs_log_item_t *lip)
981{
982 xfs_log_item_t *nlip;
983
984 while (lip != NULL) {
985 nlip = lip->li_bio_list;
986 ASSERT(lip->li_cb != NULL);
987 /*
988 * Clear the next pointer so we don't have any
989 * confusion if the item is added to another buf.
990 * Don't touch the log item after calling its
991 * callback, because it could have freed itself.
992 */
993 lip->li_bio_list = NULL;
994 lip->li_cb(bp, lip);
995 lip = nlip;
996 }
997}
998
999/*
1000 * This is the iodone() function for buffers which have had callbacks
1001 * attached to them by xfs_buf_attach_iodone(). It should remove each
1002 * log item from the buffer's list and call the callback of each in turn.
1003 * When done, the buffer's fsprivate field is set to NULL and the buffer
1004 * is unlocked with a call to iodone().
1005 */
1006void
1007xfs_buf_iodone_callbacks(
1008 xfs_buf_t *bp)
1009{
1010 xfs_log_item_t *lip;
1011 static ulong lasttime;
1012 static xfs_buftarg_t *lasttarg;
1013 xfs_mount_t *mp;
1014
1015 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
1016 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1017
1018 if (XFS_BUF_GETERROR(bp) != 0) {
1019 /*
1020 * If we've already decided to shutdown the filesystem
1021 * because of IO errors, there's no point in giving this
1022 * a retry.
1023 */
1024 mp = lip->li_mountp;
1025 if (XFS_FORCED_SHUTDOWN(mp)) {
1026 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1027 XFS_BUF_SUPER_STALE(bp);
1028 xfs_buftrace("BUF_IODONE_CB", bp);
1029 xfs_buf_do_callbacks(bp, lip);
1030 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1031 XFS_BUF_CLR_IODONE_FUNC(bp);
1032
1033 /*
1034			 * XFS_SHUT flag gets set when we go through the
1035 * entire buffer cache and deliberately start
1036 * throwing away delayed write buffers.
1037 * Since there's no biowait done on those,
1038 * we should just brelse them.
1039 */
1040 if (XFS_BUF_ISSHUT(bp)) {
1041 XFS_BUF_UNSHUT(bp);
1042 xfs_buf_relse(bp);
1043 } else {
1044 xfs_biodone(bp);
1045 }
1046
1047 return;
1048 }
1049
1050 if ((XFS_BUF_TARGET(bp) != lasttarg) ||
1051 (time_after(jiffies, (lasttime + 5*HZ)))) {
1052 lasttime = jiffies;
1053 prdev("XFS write error in file system meta-data "
1054 "block 0x%llx in %s",
1055 XFS_BUF_TARGET(bp),
1056 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
1057 }
1058 lasttarg = XFS_BUF_TARGET(bp);
1059
1060 if (XFS_BUF_ISASYNC(bp)) {
1061 /*
1062		 * If the write was asynchronous then no one will be
1063		 * looking for the error.  Clear the error state
1064		 * and write the buffer out again as a delayed write.
1065 *
1066 * XXXsup This is OK, so long as we catch these
1067 * before we start the umount; we don't want these
1068 * DELWRI metadata bufs to be hanging around.
1069 */
1070 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
1071
1072 if (!(XFS_BUF_ISSTALE(bp))) {
1073 XFS_BUF_DELAYWRITE(bp);
1074 XFS_BUF_DONE(bp);
1075 XFS_BUF_SET_START(bp);
1076 }
1077 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1078 xfs_buftrace("BUF_IODONE ASYNC", bp);
1079 xfs_buf_relse(bp);
1080 } else {
1081 /*
1082 * If the write of the buffer was not asynchronous,
1083 * then we want to make sure to return the error
1084 * to the caller of bwrite(). Because of this we
1085 * cannot clear the B_ERROR state at this point.
1086 * Instead we install a callback function that
1087 * will be called when the buffer is released, and
1088 * that routine will clear the error state and
1089 * set the buffer to be written out again after
1090 * some delay.
1091 */
1092			/* We actually overwrite the existing b_relse
1093			   function at times, but we're going to be
1094			   shutting down anyway. */
1095 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1096 XFS_BUF_DONE(bp);
1097 XFS_BUF_V_IODONESEMA(bp);
1098 }
1099 return;
1100 }
1101#ifdef XFSERRORDEBUG
1102 xfs_buftrace("XFS BUFCB NOERR", bp);
1103#endif
1104 xfs_buf_do_callbacks(bp, lip);
1105 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1106 XFS_BUF_CLR_IODONE_FUNC(bp);
1107 xfs_biodone(bp);
1108}
1109
1110/*
1111 * This is a callback routine attached to a buffer which gets an error
1112 * when being written out synchronously.
1113 */
1114STATIC void
1115xfs_buf_error_relse(
1116 xfs_buf_t *bp)
1117{
1118 xfs_log_item_t *lip;
1119 xfs_mount_t *mp;
1120
1121 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1122 mp = (xfs_mount_t *)lip->li_mountp;
1123 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1124
1125 XFS_BUF_STALE(bp);
1126 XFS_BUF_DONE(bp);
1127 XFS_BUF_UNDELAYWRITE(bp);
1128 XFS_BUF_ERROR(bp,0);
1129 xfs_buftrace("BUF_ERROR_RELSE", bp);
1130 if (! XFS_FORCED_SHUTDOWN(mp))
1131 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1132 /*
1133 * We have to unpin the pinned buffers so do the
1134 * callbacks.
1135 */
1136 xfs_buf_do_callbacks(bp, lip);
1137 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1138 XFS_BUF_CLR_IODONE_FUNC(bp);
1139 XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
1140 xfs_buf_relse(bp);
1141}
1142
1143
1144/*
1145 * This is the iodone() function for buffers which have been
1146 * logged. It is called when they are eventually flushed out.
1147 * It should remove the buf item from the AIL, and free the buf item.
1148 * It is called by xfs_buf_iodone_callbacks() above which will take
1149 * care of cleaning up the buffer itself.
1150 */
1151/* ARGSUSED */
1152void
1153xfs_buf_iodone(
1154 xfs_buf_t *bp,
1155 xfs_buf_log_item_t *bip)
1156{
1157 struct xfs_mount *mp;
1158 SPLDECL(s);
1159
1160 ASSERT(bip->bli_buf == bp);
1161
1162 mp = bip->bli_item.li_mountp;
1163
1164 /*
1165 * If we are forcibly shutting down, this may well be
1166 * off the AIL already. That's because we simulate the
1167 * log-committed callbacks to unpin these buffers. Or we may never
1168	 * have put this item on the AIL because the transaction was
1169 * aborted forcibly. xfs_trans_delete_ail() takes care of these.
1170 *
1171 * Either way, AIL is useless if we're forcing a shutdown.
1172 */
1173 AIL_LOCK(mp,s);
1174 /*
1175 * xfs_trans_delete_ail() drops the AIL lock.
1176 */
1177 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
1178
1179#ifdef XFS_TRANS_DEBUG
1180 kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
1181 bip->bli_orig = NULL;
1182 kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
1183 bip->bli_logged = NULL;
1184#endif /* XFS_TRANS_DEBUG */
1185
1186#ifdef XFS_BLI_TRACE
1187 ktrace_free(bip->bli_trace);
1188#endif
1189 kmem_zone_free(xfs_buf_item_zone, bip);
1190}
1191
1192#if defined(XFS_BLI_TRACE)
1193void
1194xfs_buf_item_trace(
1195 char *id,
1196 xfs_buf_log_item_t *bip)
1197{
1198 xfs_buf_t *bp;
1199 ASSERT(bip->bli_trace != NULL);
1200
1201 bp = bip->bli_buf;
1202 ktrace_enter(bip->bli_trace,
1203 (void *)id,
1204 (void *)bip->bli_buf,
1205 (void *)((unsigned long)bip->bli_flags),
1206 (void *)((unsigned long)bip->bli_recur),
1207 (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
1208 (void *)((unsigned long)
1209 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
1210 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
1211 (void *)((unsigned long)XFS_BUF_COUNT(bp)),
1212 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
1213 XFS_BUF_FSPRIVATE(bp, void *),
1214 XFS_BUF_FSPRIVATE2(bp, void *),
1215 (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
1216 (void *)XFS_BUF_IODONE_FUNC(bp),
1217 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
1218 (void *)bip->bli_item.li_desc,
1219 (void *)((unsigned long)bip->bli_item.li_flags));
1220}
1221#endif /* XFS_BLI_TRACE */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
new file mode 100644
index 000000000000..5f1b0c9308f6
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.h
@@ -0,0 +1,171 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BUF_ITEM_H__
33#define __XFS_BUF_ITEM_H__
34
35/*
36 * This is the structure used to lay out a buf log item in the
37 * log. The data map describes which 128 byte chunks of the buffer
38 * have been logged. This structure works only for buffers that
39 * reside within the first TB of the filesystem. These buffers are
40 * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF.
41 */
42typedef struct xfs_buf_log_format_v1 {
43 unsigned short blf_type; /* buf log item type indicator */
44 unsigned short blf_size; /* size of this item */
45 __int32_t blf_blkno; /* starting blkno of this buf */
46 ushort blf_flags; /* misc state */
47 ushort blf_len; /* number of blocks in this buf */
48 unsigned int blf_map_size; /* size of data bitmap in words */
49 unsigned int blf_data_map[1];/* variable size bitmap of */
50 /* regions of buffer in this item */
51} xfs_buf_log_format_v1_t;
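/*
 * Illustrative arithmetic: blf_blkno above is a signed 32 bit basic
 * block number, and 2^31 basic blocks of 512 bytes is exactly 1TB,
 * which is why this version of the structure only covers buffers in
 * the first TB of the filesystem.
 */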
52
53/*
54 * This is a form of the above structure with a 64 bit blkno field.
55 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
56 */
57typedef struct xfs_buf_log_format_t {
58 unsigned short blf_type; /* buf log item type indicator */
59 unsigned short blf_size; /* size of this item */
60 ushort blf_flags; /* misc state */
61 ushort blf_len; /* number of blocks in this buf */
62 __int64_t blf_blkno; /* starting blkno of this buf */
63 unsigned int blf_map_size; /* size of data bitmap in words */
64 unsigned int blf_data_map[1];/* variable size bitmap of */
65 /* regions of buffer in this item */
66} xfs_buf_log_format_t;
67
68/*
69 * This flag indicates that the buffer contains on disk inodes
70 * and requires special recovery handling.
71 */
72#define XFS_BLI_INODE_BUF 0x1
73/*
74 * This flag indicates that the buffer should not be replayed
75 * during recovery because its blocks are being freed.
76 */
77#define XFS_BLI_CANCEL 0x2
78/*
79 * This flag indicates that the buffer contains on disk
80 * user or group dquots and may require special recovery handling.
81 */
82#define XFS_BLI_UDQUOT_BUF 0x4
83/* #define XFS_BLI_PDQUOT_BUF 0x8 */
84#define XFS_BLI_GDQUOT_BUF 0x10
85
86#define XFS_BLI_CHUNK 128
87#define XFS_BLI_SHIFT 7
88#define BIT_TO_WORD_SHIFT 5
89#define NBWORD (NBBY * sizeof(unsigned int))
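/*
 * Illustrative values (assuming 32 bit ints): NBWORD is 8 * 4 = 32,
 * matching BIT_TO_WORD_SHIFT == 5 (2^5 == 32), just as
 * XFS_BLI_SHIFT == 7 matches XFS_BLI_CHUNK == 128 (2^7 == 128).
 */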
90
91/*
92 * buf log item flags
93 */
94#define XFS_BLI_HOLD 0x01
95#define XFS_BLI_DIRTY 0x02
96#define XFS_BLI_STALE 0x04
97#define XFS_BLI_LOGGED 0x08
98#define XFS_BLI_INODE_ALLOC_BUF 0x10
99#define XFS_BLI_STALE_INODE 0x20
100
101
102#ifdef __KERNEL__
103
104struct xfs_buf;
105struct ktrace;
106struct xfs_mount;
107struct xfs_buf_log_item;
108
109#if defined(XFS_BLI_TRACE)
110#define XFS_BLI_TRACE_SIZE 32
111
112void xfs_buf_item_trace(char *, struct xfs_buf_log_item *);
113#else
114#define xfs_buf_item_trace(id, bip)
115#endif
116
117/*
118 * This is the in core log item structure used to track information
119 * needed to log buffers. It tracks the buffer's lock recursion
120 * count, and which 128 byte chunks of the buffer are dirty.
121 */
122typedef struct xfs_buf_log_item {
123 xfs_log_item_t bli_item; /* common item structure */
124 struct xfs_buf *bli_buf; /* real buffer pointer */
125 unsigned int bli_flags; /* misc flags */
126 unsigned int bli_recur; /* lock recursion count */
127 atomic_t bli_refcount; /* cnt of tp refs */
128#ifdef XFS_BLI_TRACE
129 struct ktrace *bli_trace; /* event trace buf */
130#endif
131#ifdef XFS_TRANS_DEBUG
132 char *bli_orig; /* original buffer copy */
133 char *bli_logged; /* bytes logged (bitmap) */
134#endif
135 xfs_buf_log_format_t bli_format; /* in-log header */
136} xfs_buf_log_item_t;
137
138/*
139 * This structure is used during recovery to record the buf log
140 * items which have been canceled and should not be replayed.
141 */
142typedef struct xfs_buf_cancel {
143 xfs_daddr_t bc_blkno;
144 uint bc_len;
145 int bc_refcount;
146 struct xfs_buf_cancel *bc_next;
147} xfs_buf_cancel_t;
148
149void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
150void xfs_buf_item_relse(struct xfs_buf *);
151void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
152uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
153void xfs_buf_attach_iodone(struct xfs_buf *,
154 void(*)(struct xfs_buf *, xfs_log_item_t *),
155 xfs_log_item_t *);
156void xfs_buf_iodone_callbacks(struct xfs_buf *);
157void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
158
159#ifdef XFS_TRANS_DEBUG
160void
161xfs_buf_item_flush_log_debug(
162 struct xfs_buf *bp,
163 uint first,
164 uint last);
165#else
166#define xfs_buf_item_flush_log_debug(bp, first, last)
167#endif
168
169#endif /* __KERNEL__ */
170
171#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h
new file mode 100644
index 000000000000..2deac7303758
--- /dev/null
+++ b/fs/xfs/xfs_cap.h
@@ -0,0 +1,84 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_CAP_H__
33#define __XFS_CAP_H__
34
35/*
36 * Capabilities
37 */
38typedef __uint64_t xfs_cap_value_t;
39
40typedef struct xfs_cap_set {
41 xfs_cap_value_t cap_effective; /* use in capability checks */
42 xfs_cap_value_t cap_permitted; /* combined with file attrs */
43 xfs_cap_value_t cap_inheritable;/* pass through exec */
44} xfs_cap_set_t;
45
46/* On-disk XFS extended attribute names */
47#define SGI_CAP_FILE "SGI_CAP_FILE"
48#define SGI_CAP_FILE_SIZE (sizeof(SGI_CAP_FILE)-1)
49#define SGI_CAP_LINUX "SGI_CAP_LINUX"
50#define SGI_CAP_LINUX_SIZE (sizeof(SGI_CAP_LINUX)-1)
51
52/*
53 * For Linux, we take the bitfields directly from capability.h
54 * and no longer attempt to keep this attribute on-disk compatible
55 * with IRIX. Since this attribute is only set on executables,
56 * it just doesn't make much sense to try. We do use a differently
57 * named attribute though, to avoid confusion.
58 */
59
60#ifdef __KERNEL__
61
62#ifdef CONFIG_FS_POSIX_CAP
63
64#include <linux/posix_cap_xattr.h>
65
66struct vnode;
67
68extern int xfs_cap_vhascap(struct vnode *);
69extern int xfs_cap_vset(struct vnode *, void *, size_t);
70extern int xfs_cap_vget(struct vnode *, void *, size_t);
71extern int xfs_cap_vremove(struct vnode *vp);
72
73#define _CAP_EXISTS xfs_cap_vhascap
74
75#else
76#define xfs_cap_vset(v,p,sz) (-EOPNOTSUPP)
77#define xfs_cap_vget(v,p,sz) (-EOPNOTSUPP)
78#define xfs_cap_vremove(v) (-EOPNOTSUPP)
79#define _CAP_EXISTS (NULL)
80#endif
81
82#endif /* __KERNEL__ */
83
84#endif /* __XFS_CAP_H__ */
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
new file mode 100644
index 000000000000..b3215ffe0be8
--- /dev/null
+++ b/fs/xfs/xfs_clnt.h
@@ -0,0 +1,110 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_CLNT_H__
33#define __XFS_CLNT_H__
34
35/*
36 * XFS arguments structure, constructed from the arguments we
37 * are passed via the mount system call.
38 *
39 * NOTE: The mount system call is handled differently between
40 * Linux and IRIX. In IRIX we worked with a binary data
41 * structure coming in across the syscall interface from user
42 * space (the mount userspace knows about each filesystem type
43 * and the set of valid options for it, and converts the user's
44 * argument string into a binary structure _before_ making the
45 * system call), and the ABI issues that this implies.
46 *
47 * In Linux, we are passed a comma separated set of options;
48 * i.e. a NULL-terminated string of characters. Userspace mount
49 * code does not have any knowledge of mount options expected by
50 * each filesystem type and so each filesystem parses its mount
51 * options in kernel space.
52 *
53 * For the Linux port, we kept this structure pretty much intact
54 * and use it internally (because the existing code groks it).
55 */
56struct xfs_mount_args {
57 int flags; /* flags -> see XFSMNT_... macros below */
58 int logbufs; /* Number of log buffers, -1 to default */
59 int logbufsize; /* Size of log buffers, -1 to default */
60 char fsname[MAXNAMELEN+1]; /* data device name */
61 char rtname[MAXNAMELEN+1]; /* realtime device filename */
62 char logname[MAXNAMELEN+1]; /* journal device filename */
63 char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
64 int sunit; /* stripe unit (BBs) */
65 int swidth; /* stripe width (BBs), multiple of sunit */
66 uchar_t iosizelog; /* log2 of the preferred I/O size */
67 int ihashsize; /* inode hash table size (buckets) */
68};
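/*
 * Illustrative example (hypothetical devices and option names): a
 * mount such as
 *	mount -t xfs -o logbufs=8,sunit=512,swidth=4096 /dev/sdb1 /mnt
 * would be parsed in kernel space into logbufs = 8, sunit = 512 and
 * swidth = 4096, with fsname "/dev/sdb1" and mtpt "/mnt".
 */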
69
70/*
71 * XFS mount option flags
72 */
73#define XFSMNT_CHKLOG 0x00000001 /* check log */
74#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
75 * compatible */
76#define XFSMNT_INO64 0x00000004 /* move inode numbers up
77 * past 2^32 */
78#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
79#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
80#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
81 * enforcement */
82#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
83 * enforcement */
84#define XFSMNT_NOATIME 0x00000100 /* don't modify access
85 * times on reads */
86#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
87 * stripe boundaries*/
88#define XFSMNT_RETERR 0x00000400 /* return error to user */
89#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
90 * read-only mount */
91#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
92#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
93#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
94 /* (osyncisdsync is now default) */
95#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
96 * bits of address space */
97#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
98#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
99 * enforcement */
100#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
101#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
102#define XFSMNT_NOLOGFLUSH 0x04000000 /* Don't flush for log blocks */
103#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */
104#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
105 * allocation */
106#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */
107#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
108 * symlink,mkdir,rmdir,mknod */
109
110#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
new file mode 100644
index 000000000000..d7fe28866764
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.c
@@ -0,0 +1,2648 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_alloc.h"
50#include "xfs_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_da_btree.h"
59#include "xfs_attr.h"
60#include "xfs_attr_leaf.h"
61#include "xfs_dir_leaf.h"
62#include "xfs_dir2_data.h"
63#include "xfs_dir2_leaf.h"
64#include "xfs_dir2_block.h"
65#include "xfs_dir2_node.h"
66#include "xfs_error.h"
67#include "xfs_bit.h"
68
69/*
70 * xfs_da_btree.c
71 *
72 * Routines to implement directories as Btrees of hashed names.
73 */
74
75/*========================================================================
76 * Function prototypes for the kernel.
77 *========================================================================*/
78
79/*
80 * Routines used for growing the Btree.
81 */
82STATIC int xfs_da_root_split(xfs_da_state_t *state,
83 xfs_da_state_blk_t *existing_root,
84 xfs_da_state_blk_t *new_child);
85STATIC int xfs_da_node_split(xfs_da_state_t *state,
86 xfs_da_state_blk_t *existing_blk,
87 xfs_da_state_blk_t *split_blk,
88 xfs_da_state_blk_t *blk_to_add,
89 int treelevel,
90 int *result);
91STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
92 xfs_da_state_blk_t *node_blk_1,
93 xfs_da_state_blk_t *node_blk_2);
94STATIC void xfs_da_node_add(xfs_da_state_t *state,
95 xfs_da_state_blk_t *old_node_blk,
96 xfs_da_state_blk_t *new_node_blk);
97
98/*
99 * Routines used for shrinking the Btree.
100 */
101STATIC int xfs_da_root_join(xfs_da_state_t *state,
102 xfs_da_state_blk_t *root_blk);
103STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
104STATIC void xfs_da_node_remove(xfs_da_state_t *state,
105 xfs_da_state_blk_t *drop_blk);
106STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
107 xfs_da_state_blk_t *src_node_blk,
108 xfs_da_state_blk_t *dst_node_blk);
109
110/*
111 * Utility routines.
112 */
113STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
114STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
115STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
116
117
118/*========================================================================
119 * Routines used for growing the Btree.
120 *========================================================================*/
121
122/*
123 * Create the initial contents of an intermediate node.
124 */
125int
126xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
127 xfs_dabuf_t **bpp, int whichfork)
128{
129 xfs_da_intnode_t *node;
130 xfs_dabuf_t *bp;
131 int error;
132 xfs_trans_t *tp;
133
134 tp = args->trans;
135 error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
136 if (error)
137 return(error);
138 ASSERT(bp != NULL);
139 node = bp->data;
140 node->hdr.info.forw = 0;
141 node->hdr.info.back = 0;
142 INT_SET(node->hdr.info.magic, ARCH_CONVERT, XFS_DA_NODE_MAGIC);
143 node->hdr.info.pad = 0;
144 node->hdr.count = 0;
145 INT_SET(node->hdr.level, ARCH_CONVERT, level);
146
147 xfs_da_log_buf(tp, bp,
148 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
149
150 *bpp = bp;
151 return(0);
152}
153
154/*
155 * Split a leaf node, rebalance, then possibly split
156 * intermediate nodes, rebalance, etc.
157 */
158int /* error */
159xfs_da_split(xfs_da_state_t *state)
160{
161 xfs_da_state_blk_t *oldblk, *newblk, *addblk;
162 xfs_da_intnode_t *node;
163 xfs_dabuf_t *bp;
164 int max, action, error, i;
165
166 /*
167 * Walk back up the tree splitting/inserting/adjusting as necessary.
168 * If we need to insert and there isn't room, split the node, then
169 * decide which fragment to insert the new block from below into.
170 * Note that we may split the root this way, but we need more fixup.
171 */
172 max = state->path.active - 1;
173 ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
174 ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
175 state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
176
177 addblk = &state->path.blk[max]; /* initial dummy value */
178 for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
179 oldblk = &state->path.blk[i];
180 newblk = &state->altpath.blk[i];
181
182 /*
183 * If a leaf node then
184 * Allocate a new leaf node, then rebalance across them.
185 * else if an intermediate node then
186		 * We split the layer below; must we split this node too?
187 */
188 switch (oldblk->magic) {
189 case XFS_ATTR_LEAF_MAGIC:
190#ifndef __KERNEL__
191 return(ENOTTY);
192#else
193 error = xfs_attr_leaf_split(state, oldblk, newblk);
194 if ((error != 0) && (error != ENOSPC)) {
195 return(error); /* GROT: attr is inconsistent */
196 }
197 if (!error) {
198 addblk = newblk;
199 break;
200 }
201 /*
202 * Entry wouldn't fit, split the leaf again.
203 */
204 state->extravalid = 1;
205 if (state->inleaf) {
206 state->extraafter = 0; /* before newblk */
207 error = xfs_attr_leaf_split(state, oldblk,
208 &state->extrablk);
209 } else {
210 state->extraafter = 1; /* after newblk */
211 error = xfs_attr_leaf_split(state, newblk,
212 &state->extrablk);
213 }
214 if (error)
215 return(error); /* GROT: attr inconsistent */
216 addblk = newblk;
217 break;
218#endif
219 case XFS_DIR_LEAF_MAGIC:
220 ASSERT(XFS_DIR_IS_V1(state->mp));
221 error = xfs_dir_leaf_split(state, oldblk, newblk);
222 if ((error != 0) && (error != ENOSPC)) {
223 return(error); /* GROT: dir is inconsistent */
224 }
225 if (!error) {
226 addblk = newblk;
227 break;
228 }
229 /*
230 * Entry wouldn't fit, split the leaf again.
231 */
232 state->extravalid = 1;
233 if (state->inleaf) {
234 state->extraafter = 0; /* before newblk */
235 error = xfs_dir_leaf_split(state, oldblk,
236 &state->extrablk);
237 if (error)
238 return(error); /* GROT: dir incon. */
239 addblk = newblk;
240 } else {
241 state->extraafter = 1; /* after newblk */
242 error = xfs_dir_leaf_split(state, newblk,
243 &state->extrablk);
244 if (error)
245 return(error); /* GROT: dir incon. */
246 addblk = newblk;
247 }
248 break;
249 case XFS_DIR2_LEAFN_MAGIC:
250 ASSERT(XFS_DIR_IS_V2(state->mp));
251 error = xfs_dir2_leafn_split(state, oldblk, newblk);
252 if (error)
253 return error;
254 addblk = newblk;
255 break;
256 case XFS_DA_NODE_MAGIC:
257 error = xfs_da_node_split(state, oldblk, newblk, addblk,
258 max - i, &action);
259 xfs_da_buf_done(addblk->bp);
260 addblk->bp = NULL;
261 if (error)
262 return(error); /* GROT: dir is inconsistent */
263 /*
264			 * Record the newly split block for the next time through?
265 */
266 if (action)
267 addblk = newblk;
268 else
269 addblk = NULL;
270 break;
271 }
272
273 /*
274 * Update the btree to show the new hashval for this child.
275 */
276 xfs_da_fixhashpath(state, &state->path);
277 /*
278 * If we won't need this block again, it's getting dropped
279 * from the active path by the loop control, so we need
280 * to mark it done now.
281 */
282 if (i > 0 || !addblk)
283 xfs_da_buf_done(oldblk->bp);
284 }
285 if (!addblk)
286 return(0);
287
288 /*
289 * Split the root node.
290 */
291 ASSERT(state->path.active == 0);
292 oldblk = &state->path.blk[0];
293 error = xfs_da_root_split(state, oldblk, addblk);
294 if (error) {
295 xfs_da_buf_done(oldblk->bp);
296 xfs_da_buf_done(addblk->bp);
297 addblk->bp = NULL;
298 return(error); /* GROT: dir is inconsistent */
299 }
300
301 /*
302 * Update pointers to the node which used to be block 0 and
303 * just got bumped because of the addition of a new root node.
304 * There might be three blocks involved if a double split occurred,
305 * and the original block 0 could be at any position in the list.
306 */
307
308 node = oldblk->bp->data;
309 if (node->hdr.info.forw) {
310 if (INT_GET(node->hdr.info.forw, ARCH_CONVERT) == addblk->blkno) {
311 bp = addblk->bp;
312 } else {
313 ASSERT(state->extravalid);
314 bp = state->extrablk.bp;
315 }
316 node = bp->data;
317 INT_SET(node->hdr.info.back, ARCH_CONVERT, oldblk->blkno);
318 xfs_da_log_buf(state->args->trans, bp,
319 XFS_DA_LOGRANGE(node, &node->hdr.info,
320 sizeof(node->hdr.info)));
321 }
322 node = oldblk->bp->data;
323 if (INT_GET(node->hdr.info.back, ARCH_CONVERT)) {
324 if (INT_GET(node->hdr.info.back, ARCH_CONVERT) == addblk->blkno) {
325 bp = addblk->bp;
326 } else {
327 ASSERT(state->extravalid);
328 bp = state->extrablk.bp;
329 }
330 node = bp->data;
331 INT_SET(node->hdr.info.forw, ARCH_CONVERT, oldblk->blkno);
332 xfs_da_log_buf(state->args->trans, bp,
333 XFS_DA_LOGRANGE(node, &node->hdr.info,
334 sizeof(node->hdr.info)));
335 }
336 xfs_da_buf_done(oldblk->bp);
337 xfs_da_buf_done(addblk->bp);
338 addblk->bp = NULL;
339 return(0);
340}
341
342/*
343 * Split the root. We have to create a new root and point to the two
344 * parts (the split old root) that we just created. Copy block zero to
345 * the EOF, extending the inode in process.
346 */
347STATIC int /* error */
348xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
349 xfs_da_state_blk_t *blk2)
350{
351 xfs_da_intnode_t *node, *oldroot;
352 xfs_da_args_t *args;
353 xfs_dablk_t blkno;
354 xfs_dabuf_t *bp;
355 int error, size;
356 xfs_inode_t *dp;
357 xfs_trans_t *tp;
358 xfs_mount_t *mp;
359 xfs_dir2_leaf_t *leaf;
360
361 /*
362 * Copy the existing (incorrect) block from the root node position
363 * to a free space somewhere.
364 */
365 args = state->args;
366 ASSERT(args != NULL);
367 error = xfs_da_grow_inode(args, &blkno);
368 if (error)
369 return(error);
370 dp = args->dp;
371 tp = args->trans;
372 mp = state->mp;
373 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
374 if (error)
375 return(error);
376 ASSERT(bp != NULL);
377 node = bp->data;
378 oldroot = blk1->bp->data;
379 if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
380 size = (int)((char *)&oldroot->btree[INT_GET(oldroot->hdr.count, ARCH_CONVERT)] -
381 (char *)oldroot);
382 } else {
383 ASSERT(XFS_DIR_IS_V2(mp));
384 ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
385 leaf = (xfs_dir2_leaf_t *)oldroot;
386 size = (int)((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] -
387 (char *)leaf);
388 }
389 memcpy(node, oldroot, size);
390 xfs_da_log_buf(tp, bp, 0, size - 1);
391 xfs_da_buf_done(blk1->bp);
392 blk1->bp = bp;
393 blk1->blkno = blkno;
394
395 /*
396 * Set up the new root node.
397 */
398 error = xfs_da_node_create(args,
399 args->whichfork == XFS_DATA_FORK &&
400 XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
401 INT_GET(node->hdr.level, ARCH_CONVERT) + 1, &bp, args->whichfork);
402 if (error)
403 return(error);
404 node = bp->data;
405 INT_SET(node->btree[0].hashval, ARCH_CONVERT, blk1->hashval);
406 INT_SET(node->btree[0].before, ARCH_CONVERT, blk1->blkno);
407 INT_SET(node->btree[1].hashval, ARCH_CONVERT, blk2->hashval);
408 INT_SET(node->btree[1].before, ARCH_CONVERT, blk2->blkno);
409 INT_SET(node->hdr.count, ARCH_CONVERT, 2);
410
411#ifdef DEBUG
412 if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
413 ASSERT(blk1->blkno >= mp->m_dirleafblk &&
414 blk1->blkno < mp->m_dirfreeblk);
415 ASSERT(blk2->blkno >= mp->m_dirleafblk &&
416 blk2->blkno < mp->m_dirfreeblk);
417 }
418#endif
419
420 /* Header is already logged by xfs_da_node_create */
421 xfs_da_log_buf(tp, bp,
422 XFS_DA_LOGRANGE(node, node->btree,
423 sizeof(xfs_da_node_entry_t) * 2));
424 xfs_da_buf_done(bp);
425
426 return(0);
427}
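/*
 * Illustrative sketch (not from the original source): the shape of a
 * root split.  The old root's contents move to a newly allocated block
 * so the root itself stays at block 0 (or m_dirleafblk for v2 dirs):
 *
 *        before                    after
 *        ------                    -----
 *        [root: blk 0]             [new root: blk 0]
 *         /       \                 /            \
 *       ...       ...         [old root copy]   [blk2]
 *                               /      \          ...
 *                             ...      ...
 *
 * The new root holds exactly two entries: the hashval/blkno pairs for
 * the copied old root (blk1) and its split sibling (blk2).
 */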
428
429/*
430 * Split the node, rebalance, then add the new entry.
431 */
432STATIC int /* error */
433xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
434 xfs_da_state_blk_t *newblk,
435 xfs_da_state_blk_t *addblk,
436 int treelevel, int *result)
437{
438 xfs_da_intnode_t *node;
439 xfs_dablk_t blkno;
440 int newcount, error;
441 int useextra;
442
443 node = oldblk->bp->data;
444 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
445
446 /*
447 * With V2 the extra block is data or freespace, so it is not inserted here.
448 */
449 useextra = state->extravalid && XFS_DIR_IS_V1(state->mp);
450 newcount = 1 + useextra;
451 /*
452 * Do we have to split the node?
453 */
454 if ((INT_GET(node->hdr.count, ARCH_CONVERT) + newcount) > state->node_ents) {
455 /*
456 * Allocate a new node, add to the doubly linked chain of
457 * nodes, then move some of our excess entries into it.
458 */
459 error = xfs_da_grow_inode(state->args, &blkno);
460 if (error)
461 return(error); /* GROT: dir is inconsistent */
462
463 error = xfs_da_node_create(state->args, blkno, treelevel,
464 &newblk->bp, state->args->whichfork);
465 if (error)
466 return(error); /* GROT: dir is inconsistent */
467 newblk->blkno = blkno;
468 newblk->magic = XFS_DA_NODE_MAGIC;
469 xfs_da_node_rebalance(state, oldblk, newblk);
470 error = xfs_da_blk_link(state, oldblk, newblk);
471 if (error)
472 return(error);
473 *result = 1;
474 } else {
475 *result = 0;
476 }
477
478 /*
479 * Insert the new entry(s) into the correct block
480 * (updating last hashval in the process).
481 *
482 * xfs_da_node_add() inserts BEFORE the given index,
483 * and as a result of using node_lookup_int() we always
484 * point to a valid entry (not after one), but a split
485 * operation always results in a new block whose hashvals
486 * FOLLOW the current block.
487 *
488 * If we had a double-split op below us, then add the extra block too.
489 */
490 node = oldblk->bp->data;
491 if (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)) {
492 oldblk->index++;
493 xfs_da_node_add(state, oldblk, addblk);
494 if (useextra) {
495 if (state->extraafter)
496 oldblk->index++;
497 xfs_da_node_add(state, oldblk, &state->extrablk);
498 state->extravalid = 0;
499 }
500 } else {
501 newblk->index++;
502 xfs_da_node_add(state, newblk, addblk);
503 if (useextra) {
504 if (state->extraafter)
505 newblk->index++;
506 xfs_da_node_add(state, newblk, &state->extrablk);
507 state->extravalid = 0;
508 }
509 }
510
511 return(0);
512}
513
514/*
515 * Balance the btree elements between two intermediate nodes,
516 * usually one full and one empty.
517 *
518 * NOTE: if blk2 is empty, then it will get the upper half of blk1.
519 */
520STATIC void
521xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
522 xfs_da_state_blk_t *blk2)
523{
524 xfs_da_intnode_t *node1, *node2, *tmpnode;
525 xfs_da_node_entry_t *btree_s, *btree_d;
526 int count, tmp;
527 xfs_trans_t *tp;
528
529 node1 = blk1->bp->data;
530 node2 = blk2->bp->data;
531 /*
532 * Figure out how many entries need to move, and in which direction.
533 * Swap the nodes around if that makes it simpler.
534 */
535 if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
536 ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
537 (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
538 INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
539 tmpnode = node1;
540 node1 = node2;
541 node2 = tmpnode;
542 }
543 ASSERT(INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
544 ASSERT(INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
545 count = (INT_GET(node1->hdr.count, ARCH_CONVERT) - INT_GET(node2->hdr.count, ARCH_CONVERT)) / 2;
546 if (count == 0)
547 return;
548 tp = state->args->trans;
549 /*
550 * Two cases: high-to-low and low-to-high.
551 */
552 if (count > 0) {
553 /*
554 * Move elements in node2 up to make a hole.
555 */
556 if ((tmp = INT_GET(node2->hdr.count, ARCH_CONVERT)) > 0) {
557 tmp *= (uint)sizeof(xfs_da_node_entry_t);
558 btree_s = &node2->btree[0];
559 btree_d = &node2->btree[count];
560 memmove(btree_d, btree_s, tmp);
561 }
562
563 /*
564 * Move the req'd B-tree elements from high in node1 to
565 * low in node2.
566 */
567 INT_MOD(node2->hdr.count, ARCH_CONVERT, count);
568 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
569 btree_s = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT) - count];
570 btree_d = &node2->btree[0];
571 memcpy(btree_d, btree_s, tmp);
572 INT_MOD(node1->hdr.count, ARCH_CONVERT, -(count));
573
574 } else {
575 /*
576 * Move the req'd B-tree elements from low in node2 to
577 * high in node1.
578 */
579 count = -count;
580 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
581 btree_s = &node2->btree[0];
582 btree_d = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT)];
583 memcpy(btree_d, btree_s, tmp);
584 INT_MOD(node1->hdr.count, ARCH_CONVERT, count);
585 xfs_da_log_buf(tp, blk1->bp,
586 XFS_DA_LOGRANGE(node1, btree_d, tmp));
587
588 /*
589 * Move elements in node2 down to fill the hole.
590 */
591 tmp = INT_GET(node2->hdr.count, ARCH_CONVERT) - count;
592 tmp *= (uint)sizeof(xfs_da_node_entry_t);
593 btree_s = &node2->btree[count];
594 btree_d = &node2->btree[0];
595 memmove(btree_d, btree_s, tmp);
596 INT_MOD(node2->hdr.count, ARCH_CONVERT, -(count));
597 }
598
599 /*
600 * Log header of node 1 and all current bits of node 2.
601 */
602 xfs_da_log_buf(tp, blk1->bp,
603 XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
604 xfs_da_log_buf(tp, blk2->bp,
605 XFS_DA_LOGRANGE(node2, &node2->hdr,
606 sizeof(node2->hdr) +
607 sizeof(node2->btree[0]) * INT_GET(node2->hdr.count, ARCH_CONVERT)));
608
609 /*
610 * Record the last hashval from each block for upward propagation.
611 * (note: don't use the swapped node pointers)
612 */
613 node1 = blk1->bp->data;
614 node2 = blk2->bp->data;
615 blk1->hashval = INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
616 blk2->hashval = INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
617
618 /*
619 * Adjust the expected index for insertion.
620 */
621 if (blk1->index >= INT_GET(node1->hdr.count, ARCH_CONVERT)) {
622 blk2->index = blk1->index - INT_GET(node1->hdr.count, ARCH_CONVERT);
623 blk1->index = INT_GET(node1->hdr.count, ARCH_CONVERT) + 1; /* make it invalid */
624 }
625}
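/*
 * Worked example (illustrative, not in the original source): if node1
 * holds 10 entries and node2 holds 2, then count = (10 - 2) / 2 = 4,
 * so the 4 highest-hashval entries of node1 are copied to the front of
 * node2, leaving 6 and 6.  With an empty node2, count = 10 / 2 = 5 and
 * node2 receives the upper half of node1, matching the NOTE above.
 */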
626
627/*
628 * Add a new entry to an intermediate node.
629 */
630STATIC void
631xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
632 xfs_da_state_blk_t *newblk)
633{
634 xfs_da_intnode_t *node;
635 xfs_da_node_entry_t *btree;
636 int tmp;
637 xfs_mount_t *mp;
638
639 node = oldblk->bp->data;
640 mp = state->mp;
641 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
642 ASSERT((oldblk->index >= 0) && (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)));
643 ASSERT(newblk->blkno != 0);
644 if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
645 ASSERT(newblk->blkno >= mp->m_dirleafblk &&
646 newblk->blkno < mp->m_dirfreeblk);
647
648 /*
649 * We may need to make some room before we insert the new node.
650 */
651 tmp = 0;
652 btree = &node->btree[ oldblk->index ];
653 if (oldblk->index < INT_GET(node->hdr.count, ARCH_CONVERT)) {
654 tmp = (INT_GET(node->hdr.count, ARCH_CONVERT) - oldblk->index) * (uint)sizeof(*btree);
655 memmove(btree + 1, btree, tmp);
656 }
657 INT_SET(btree->hashval, ARCH_CONVERT, newblk->hashval);
658 INT_SET(btree->before, ARCH_CONVERT, newblk->blkno);
659 xfs_da_log_buf(state->args->trans, oldblk->bp,
660 XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
661 INT_MOD(node->hdr.count, ARCH_CONVERT, +1);
662 xfs_da_log_buf(state->args->trans, oldblk->bp,
663 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
664
665 /*
666 * Copy the last hash value from the oldblk to propagate upwards.
667 */
668 oldblk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
669}
670
671/*========================================================================
672 * Routines used for shrinking the Btree.
673 *========================================================================*/
674
675/*
676 * Deallocate an empty leaf node, remove it from its parent,
677 * possibly deallocating that block, etc...
678 */
679int
680xfs_da_join(xfs_da_state_t *state)
681{
682 xfs_da_state_blk_t *drop_blk, *save_blk;
683 int action, error;
684
685 action = 0;
686 drop_blk = &state->path.blk[ state->path.active-1 ];
687 save_blk = &state->altpath.blk[ state->path.active-1 ];
688 ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
689 ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
690 drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
691
692 /*
693 * Walk back up the tree joining/deallocating as necessary.
694 * When we stop dropping blocks, break out.
695 */
696 for ( ; state->path.active >= 2; drop_blk--, save_blk--,
697 state->path.active--) {
698 /*
699 * See if we can combine the block with a neighbor.
700 * (action == 0) => no options, just leave
701 * (action == 1) => coalesce, then unlink
702 * (action == 2) => block empty, unlink it
703 */
704 switch (drop_blk->magic) {
705 case XFS_ATTR_LEAF_MAGIC:
706#ifndef __KERNEL__
707 error = ENOTTY;
708#else
709 error = xfs_attr_leaf_toosmall(state, &action);
710#endif
711 if (error)
712 return(error);
713 if (action == 0)
714 return(0);
715#ifdef __KERNEL__
716 xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
717#endif
718 break;
719 case XFS_DIR_LEAF_MAGIC:
720 ASSERT(XFS_DIR_IS_V1(state->mp));
721 error = xfs_dir_leaf_toosmall(state, &action);
722 if (error)
723 return(error);
724 if (action == 0)
725 return(0);
726 xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
727 break;
728 case XFS_DIR2_LEAFN_MAGIC:
729 ASSERT(XFS_DIR_IS_V2(state->mp));
730 error = xfs_dir2_leafn_toosmall(state, &action);
731 if (error)
732 return error;
733 if (action == 0)
734 return 0;
735 xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
736 break;
737 case XFS_DA_NODE_MAGIC:
738 /*
739 * Remove the offending node, fixup hashvals,
740 * check for a toosmall neighbor.
741 */
742 xfs_da_node_remove(state, drop_blk);
743 xfs_da_fixhashpath(state, &state->path);
744 error = xfs_da_node_toosmall(state, &action);
745 if (error)
746 return(error);
747 if (action == 0)
748 return 0;
749 xfs_da_node_unbalance(state, drop_blk, save_blk);
750 break;
751 }
752 xfs_da_fixhashpath(state, &state->altpath);
753 error = xfs_da_blk_unlink(state, drop_blk, save_blk);
754 xfs_da_state_kill_altpath(state);
755 if (error)
756 return(error);
757 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
758 drop_blk->bp);
759 drop_blk->bp = NULL;
760 if (error)
761 return(error);
762 }
763 /*
764 * We joined all the way to the top. If it turns out that
765 * we only have one entry in the root, make the child block
766 * the new root.
767 */
768 xfs_da_node_remove(state, drop_blk);
769 xfs_da_fixhashpath(state, &state->path);
770 error = xfs_da_root_join(state, &state->path.blk[0]);
771 return(error);
772}
773
774/*
775 * We have only one entry in the root. Copy the only remaining child of
776 * the old root to block 0 as the new root node.
777 */
778STATIC int
779xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
780{
781 xfs_da_intnode_t *oldroot;
782 /* REFERENCED */
783 xfs_da_blkinfo_t *blkinfo;
784 xfs_da_args_t *args;
785 xfs_dablk_t child;
786 xfs_dabuf_t *bp;
787 int error;
788
789 args = state->args;
790 ASSERT(args != NULL);
791 ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
792 oldroot = root_blk->bp->data;
793 ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
794 ASSERT(!oldroot->hdr.info.forw);
795 ASSERT(!oldroot->hdr.info.back);
796
797 /*
798 * If the root has more than one child, then don't do anything.
799 */
800 if (INT_GET(oldroot->hdr.count, ARCH_CONVERT) > 1)
801 return(0);
802
803 /*
804 * Read in the (only) child block, then copy those bytes into
805 * the root block's buffer and free the original child block.
806 */
807 child = INT_GET(oldroot->btree[ 0 ].before, ARCH_CONVERT);
808 ASSERT(child != 0);
809 error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
810 args->whichfork);
811 if (error)
812 return(error);
813 ASSERT(bp != NULL);
814 blkinfo = bp->data;
815 if (INT_GET(oldroot->hdr.level, ARCH_CONVERT) == 1) {
816 ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
817 INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
818 } else {
819 ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
820 }
821 ASSERT(!blkinfo->forw);
822 ASSERT(!blkinfo->back);
823 memcpy(root_blk->bp->data, bp->data, state->blocksize);
824 xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
825 error = xfs_da_shrink_inode(args, child, bp);
826 return(error);
827}
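/*
 * Illustrative note (not from the original source): this is the
 * inverse of xfs_da_root_split() above.  Once the root is down to a
 * single "before" pointer, the child's bytes are copied wholesale over
 * block 0 and the child block is handed back via xfs_da_shrink_inode(),
 * shrinking the tree by one level.
 */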
828
829/*
830 * Check a node block and its neighbors to see if the block should be
831 * collapsed into one or the other neighbor. Always keep the block
832 * with the smaller block number.
833 * If the current block is over 50% full, don't try to join it; *action is 0.
834 * If the block is empty, fill in the state structure and set *action to 2.
835 * If it can be collapsed, fill in the state structure and set *action to 1.
836 * If nothing can be done, set *action to 0.
837 */
838STATIC int
839xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
840{
841 xfs_da_intnode_t *node;
842 xfs_da_state_blk_t *blk;
843 xfs_da_blkinfo_t *info;
844 int count, forward, error, retval, i;
845 xfs_dablk_t blkno;
846 xfs_dabuf_t *bp;
847
848 /*
849 * Check for the degenerate case of the block being over 50% full.
850 * If so, it's not worth even looking to see if we might be able
851 * to coalesce with a sibling.
852 */
853 blk = &state->path.blk[ state->path.active-1 ];
854 info = blk->bp->data;
855 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
856 node = (xfs_da_intnode_t *)info;
857 count = INT_GET(node->hdr.count, ARCH_CONVERT);
858 if (count > (state->node_ents >> 1)) {
859		*action = 0;	/* blk over 50% full, don't try to join */
860		return(0);
861 }
862
863 /*
864 * Check for the degenerate case of the block being empty.
865 * If the block is empty, we'll simply delete it, no need to
866 * coalesce it with a sibling block.  We choose (arbitrarily)
867 * to merge with the forward block unless it is NULL.
868 */
869 if (count == 0) {
870 /*
871 * Make altpath point to the block we want to keep and
872 * path point to the block we want to drop (this one).
873 */
874 forward = info->forw;
875 memcpy(&state->altpath, &state->path, sizeof(state->path));
876 error = xfs_da_path_shift(state, &state->altpath, forward,
877 0, &retval);
878 if (error)
879 return(error);
880 if (retval) {
881 *action = 0;
882 } else {
883 *action = 2;
884 }
885 return(0);
886 }
887
888 /*
889 * Examine each sibling block to see if we can coalesce with
890 * at least 25% free space to spare. We need to figure out
891 * whether to merge with the forward or the backward block.
892 * We prefer coalescing with the lower numbered sibling so as
893 * to shrink a directory over time.
894 */
895 /* start with smaller blk num */
896 forward = (INT_GET(info->forw, ARCH_CONVERT)
897 < INT_GET(info->back, ARCH_CONVERT));
898 for (i = 0; i < 2; forward = !forward, i++) {
899 if (forward)
900 blkno = INT_GET(info->forw, ARCH_CONVERT);
901 else
902 blkno = INT_GET(info->back, ARCH_CONVERT);
903 if (blkno == 0)
904 continue;
905 error = xfs_da_read_buf(state->args->trans, state->args->dp,
906 blkno, -1, &bp, state->args->whichfork);
907 if (error)
908 return(error);
909 ASSERT(bp != NULL);
910
911 node = (xfs_da_intnode_t *)info;
912 count = state->node_ents;
913 count -= state->node_ents >> 2;
914 count -= INT_GET(node->hdr.count, ARCH_CONVERT);
915 node = bp->data;
916 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
917 count -= INT_GET(node->hdr.count, ARCH_CONVERT);
918 xfs_da_brelse(state->args->trans, bp);
919 if (count >= 0)
920 break; /* fits with at least 25% to spare */
921 }
922 if (i >= 2) {
923 *action = 0;
924 return(0);
925 }
926
927 /*
928 * Make altpath point to the block we want to keep (the lower
929 * numbered block) and path point to the block we want to drop.
930 */
931 memcpy(&state->altpath, &state->path, sizeof(state->path));
932 if (blkno < blk->blkno) {
933 error = xfs_da_path_shift(state, &state->altpath, forward,
934 0, &retval);
935 if (error) {
936 return(error);
937 }
938 if (retval) {
939 *action = 0;
940 return(0);
941 }
942 } else {
943 error = xfs_da_path_shift(state, &state->path, forward,
944 0, &retval);
945 if (error) {
946 return(error);
947 }
948 if (retval) {
949 *action = 0;
950 return(0);
951 }
952 }
953 *action = 1;
954 return(0);
955}
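/*
 * Worked example of the 25%-slack test above (illustrative, not in the
 * original source): with state->node_ents == 64, the merge budget is
 * 64 - (64 >> 2) = 48 entries.  A block holding 20 entries can
 * coalesce with a sibling holding up to 28, since 20 + 28 <= 48 leaves
 * at least a quarter of the combined block free; a sibling with 29 or
 * more makes count go negative and the merge is skipped.
 */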
956
957/*
958 * Walk back up the tree adjusting hash values as necessary;
959 * when we stop making changes, return.
960 */
961void
962xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
963{
964 xfs_da_state_blk_t *blk;
965 xfs_da_intnode_t *node;
966 xfs_da_node_entry_t *btree;
967 xfs_dahash_t lasthash=0;
968 int level, count;
969
970 level = path->active-1;
971 blk = &path->blk[ level ];
972 switch (blk->magic) {
973#ifdef __KERNEL__
974 case XFS_ATTR_LEAF_MAGIC:
975 lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
976 if (count == 0)
977 return;
978 break;
979#endif
980 case XFS_DIR_LEAF_MAGIC:
981 ASSERT(XFS_DIR_IS_V1(state->mp));
982 lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
983 if (count == 0)
984 return;
985 break;
986 case XFS_DIR2_LEAFN_MAGIC:
987 ASSERT(XFS_DIR_IS_V2(state->mp));
988 lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
989 if (count == 0)
990 return;
991 break;
992 case XFS_DA_NODE_MAGIC:
993 lasthash = xfs_da_node_lasthash(blk->bp, &count);
994 if (count == 0)
995 return;
996 break;
997 }
998 for (blk--, level--; level >= 0; blk--, level--) {
999 node = blk->bp->data;
1000 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1001 btree = &node->btree[ blk->index ];
1002 if (INT_GET(btree->hashval, ARCH_CONVERT) == lasthash)
1003 break;
1004 blk->hashval = lasthash;
1005 INT_SET(btree->hashval, ARCH_CONVERT, lasthash);
1006 xfs_da_log_buf(state->args->trans, blk->bp,
1007 XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
1008
1009 lasthash = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1010 }
1011}
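/*
 * Illustrative sketch (not from the original source): hashval
 * propagation stops as soon as a parent entry already carries the
 * right value.  E.g. if a leaf's last hashval drops from 0x90 to 0x80,
 * the parent entry pointing at it is rewritten to 0x80; if that entry
 * was not the last one in the parent block, the grandparent still sees
 * the parent's unchanged last hashval and the loop breaks early.
 */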
1012
1013/*
1014 * Remove an entry from an intermediate node.
1015 */
1016STATIC void
1017xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
1018{
1019 xfs_da_intnode_t *node;
1020 xfs_da_node_entry_t *btree;
1021 int tmp;
1022
1023 node = drop_blk->bp->data;
1024 ASSERT(drop_blk->index < INT_GET(node->hdr.count, ARCH_CONVERT));
1025 ASSERT(drop_blk->index >= 0);
1026
1027 /*
1028 * Copy over the offending entry, or just zero it out.
1029 */
1030 btree = &node->btree[drop_blk->index];
1031 if (drop_blk->index < (INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
1032 tmp = INT_GET(node->hdr.count, ARCH_CONVERT) - drop_blk->index - 1;
1033 tmp *= (uint)sizeof(xfs_da_node_entry_t);
1034 memmove(btree, btree + 1, tmp);
1035 xfs_da_log_buf(state->args->trans, drop_blk->bp,
1036 XFS_DA_LOGRANGE(node, btree, tmp));
1037 btree = &node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ];
1038 }
1039 memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
1040 xfs_da_log_buf(state->args->trans, drop_blk->bp,
1041 XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
1042 INT_MOD(node->hdr.count, ARCH_CONVERT, -1);
1043 xfs_da_log_buf(state->args->trans, drop_blk->bp,
1044 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
1045
1046 /*
1047 * Copy the last hash value from the block to propagate upwards.
1048 */
1049 btree--;
1050 drop_blk->hashval = INT_GET(btree->hashval, ARCH_CONVERT);
1051}
1052
1053/*
1054 * Unbalance the btree elements between two intermediate nodes:
1055 * move all of the elements from one node into the other.
1056 */
1057STATIC void
1058xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1059 xfs_da_state_blk_t *save_blk)
1060{
1061 xfs_da_intnode_t *drop_node, *save_node;
1062 xfs_da_node_entry_t *btree;
1063 int tmp;
1064 xfs_trans_t *tp;
1065
1066 drop_node = drop_blk->bp->data;
1067 save_node = save_blk->bp->data;
1068 ASSERT(INT_GET(drop_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1069 ASSERT(INT_GET(save_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1070 tp = state->args->trans;
1071
1072 /*
1073 * If the dying block has lower hashvals, then move all the
1074 * elements in the remaining block up to make a hole.
1075 */
1076 if ((INT_GET(drop_node->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(save_node->btree[ 0 ].hashval, ARCH_CONVERT)) ||
1077 (INT_GET(drop_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
1078 INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))
1079 {
1080 btree = &save_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT) ];
1081 tmp = INT_GET(save_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
1082 memmove(btree, &save_node->btree[0], tmp);
1083 btree = &save_node->btree[0];
1084 xfs_da_log_buf(tp, save_blk->bp,
1085 XFS_DA_LOGRANGE(save_node, btree,
1086 (INT_GET(save_node->hdr.count, ARCH_CONVERT) + INT_GET(drop_node->hdr.count, ARCH_CONVERT)) *
1087 sizeof(xfs_da_node_entry_t)));
1088 } else {
1089 btree = &save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT) ];
1090 xfs_da_log_buf(tp, save_blk->bp,
1091 XFS_DA_LOGRANGE(save_node, btree,
1092 INT_GET(drop_node->hdr.count, ARCH_CONVERT) *
1093 sizeof(xfs_da_node_entry_t)));
1094 }
1095
1096 /*
1097 * Move all the B-tree elements from drop_blk to save_blk.
1098 */
1099 tmp = INT_GET(drop_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
1100 memcpy(btree, &drop_node->btree[0], tmp);
1101 INT_MOD(save_node->hdr.count, ARCH_CONVERT, INT_GET(drop_node->hdr.count, ARCH_CONVERT));
1102
1103 xfs_da_log_buf(tp, save_blk->bp,
1104 XFS_DA_LOGRANGE(save_node, &save_node->hdr,
1105 sizeof(save_node->hdr)));
1106
1107 /*
1108 * Save the last hashval in the remaining block for upward propagation.
1109 */
1110 save_blk->hashval = INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1111}
1112
1113/*========================================================================
1114 * Routines used for finding things in the Btree.
1115 *========================================================================*/
1116
1117/*
1118 * Walk down the Btree looking for a particular filename, filling
1119 * in the state structure as we go.
1120 *
1121 * We will set the state structure to point to each of the elements
1122 * in each of the nodes where either the hashval is or should be.
1123 *
1124 * We support duplicate hashvals, so for each entry in the current
1125 * node that could contain the desired hashval, descend. This is a
1126 * pruned depth-first tree search.
1127 */
1128int /* error */
1129xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1130{
1131 xfs_da_state_blk_t *blk;
1132 xfs_da_blkinfo_t *curr;
1133 xfs_da_intnode_t *node;
1134 xfs_da_node_entry_t *btree;
1135 xfs_dablk_t blkno;
1136 int probe, span, max, error, retval;
1137 xfs_dahash_t hashval;
1138 xfs_da_args_t *args;
1139
1140 args = state->args;
1141
1142 /*
1143 * Descend thru the B-tree searching each level for the right
1144 * node to use, until the right hashval is found.
1145 */
1146 if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
1147 blkno = state->mp->m_dirleafblk;
1148 else
1149 blkno = 0;
1150 for (blk = &state->path.blk[0], state->path.active = 1;
1151 state->path.active <= XFS_DA_NODE_MAXDEPTH;
1152 blk++, state->path.active++) {
1153 /*
1154 * Read the next node down in the tree.
1155 */
1156 blk->blkno = blkno;
1157 error = xfs_da_read_buf(args->trans, args->dp, blkno,
1158 -1, &blk->bp, args->whichfork);
1159 if (error) {
1160 blk->blkno = 0;
1161 state->path.active--;
1162 return(error);
1163 }
1164 curr = blk->bp->data;
1165 ASSERT(INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
1166 INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1167 INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
1168
1169 /*
1170 * Search an intermediate node for a match.
1171 */
1172 blk->magic = INT_GET(curr->magic, ARCH_CONVERT);
1173 if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
1174 node = blk->bp->data;
1175 blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1176
1177 /*
1178 * Binary search. (note: small blocks will skip loop)
1179 */
1180 max = INT_GET(node->hdr.count, ARCH_CONVERT);
1181 probe = span = max / 2;
1182 hashval = args->hashval;
1183 for (btree = &node->btree[probe]; span > 4;
1184 btree = &node->btree[probe]) {
1185 span /= 2;
1186 if (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)
1187 probe += span;
1188 else if (INT_GET(btree->hashval, ARCH_CONVERT) > hashval)
1189 probe -= span;
1190 else
1191 break;
1192 }
1193 ASSERT((probe >= 0) && (probe < max));
1194 ASSERT((span <= 4) || (INT_GET(btree->hashval, ARCH_CONVERT) == hashval));
1195
1196 /*
1197		 * Since we may have duplicate hashvals, find the first
1198 * matching hashval in the node.
1199 */
1200 while ((probe > 0) && (INT_GET(btree->hashval, ARCH_CONVERT) >= hashval)) {
1201 btree--;
1202 probe--;
1203 }
1204 while ((probe < max) && (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)) {
1205 btree++;
1206 probe++;
1207 }
1208
1209 /*
1210 * Pick the right block to descend on.
1211 */
1212 if (probe == max) {
1213 blk->index = max-1;
1214 blkno = INT_GET(node->btree[ max-1 ].before, ARCH_CONVERT);
1215 } else {
1216 blk->index = probe;
1217 blkno = INT_GET(btree->before, ARCH_CONVERT);
1218 }
1219 }
1220#ifdef __KERNEL__
1221 else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
1222 blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1223 break;
1224 }
1225#endif
1226 else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
1227 blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
1228 break;
1229 }
1230 else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
1231 blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
1232 break;
1233 }
1234 }
1235
1236 /*
1237 * A leaf block that ends in the hashval that we are interested in
1238 * (final hashval == search hashval) means that the next block may
1239	 * contain more entries with the same hashval; shift upward to the
1240 * next leaf and keep searching.
1241 */
1242 for (;;) {
1243 if (blk->magic == XFS_DIR_LEAF_MAGIC) {
1244 ASSERT(XFS_DIR_IS_V1(state->mp));
1245 retval = xfs_dir_leaf_lookup_int(blk->bp, args,
1246 &blk->index);
1247 } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1248 ASSERT(XFS_DIR_IS_V2(state->mp));
1249 retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1250 &blk->index, state);
1251 }
1252#ifdef __KERNEL__
1253 else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1254 retval = xfs_attr_leaf_lookup_int(blk->bp, args);
1255 blk->index = args->index;
1256 args->blkno = blk->blkno;
1257 }
1258#endif
1259 if (((retval == ENOENT) || (retval == ENOATTR)) &&
1260 (blk->hashval == args->hashval)) {
1261 error = xfs_da_path_shift(state, &state->path, 1, 1,
1262 &retval);
1263 if (error)
1264 return(error);
1265 if (retval == 0) {
1266 continue;
1267 }
1268#ifdef __KERNEL__
1269 else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1270 /* path_shift() gives ENOENT */
1271 retval = XFS_ERROR(ENOATTR);
1272 }
1273#endif
1274 }
1275 break;
1276 }
1277 *result = retval;
1278 return(0);
1279}
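#if 0	/* illustrative only -- a sketch, not part of the original file */
/*
 * Minimal userspace model (assumption: a plain sorted uint array
 * standing in for node->btree[].hashval) of the probe loop above:
 * binary-search until the span is <= 4, then walk linearly to the
 * FIRST entry with a hashval >= the search value, so runs of duplicate
 * hashvals are never skipped.
 */
static int first_at_or_above(const unsigned int *hashes, int max,
			     unsigned int want)
{
	int probe = max / 2;		/* start in the middle */
	int span = max / 2;

	while (span > 4) {
		span /= 2;
		if (hashes[probe] < want)
			probe += span;
		else if (hashes[probe] > want)
			probe -= span;
		else
			break;
	}
	/* back up over duplicates, then forward past smaller values */
	while (probe > 0 && hashes[probe] >= want)
		probe--;
	while (probe < max && hashes[probe] < want)
		probe++;
	return probe;			/* == max if every entry < want */
}
#endif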
1280
1281/*========================================================================
1282 * Utility routines.
1283 *========================================================================*/
1284
1285/*
1286 * Link a new block into a doubly linked list of blocks (of whatever type).
1287 */
1288int /* error */
1289xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1290 xfs_da_state_blk_t *new_blk)
1291{
1292 xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
1293 xfs_da_args_t *args;
1294 int before=0, error;
1295 xfs_dabuf_t *bp;
1296
1297 /*
1298 * Set up environment.
1299 */
1300 args = state->args;
1301 ASSERT(args != NULL);
1302 old_info = old_blk->bp->data;
1303 new_info = new_blk->bp->data;
1304 ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
1305 old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1306 old_blk->magic == XFS_ATTR_LEAF_MAGIC);
1307 ASSERT(old_blk->magic == INT_GET(old_info->magic, ARCH_CONVERT));
1308 ASSERT(new_blk->magic == INT_GET(new_info->magic, ARCH_CONVERT));
1309 ASSERT(old_blk->magic == new_blk->magic);
1310
1311 switch (old_blk->magic) {
1312#ifdef __KERNEL__
1313 case XFS_ATTR_LEAF_MAGIC:
1314 before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
1315 break;
1316#endif
1317 case XFS_DIR_LEAF_MAGIC:
1318 ASSERT(XFS_DIR_IS_V1(state->mp));
1319 before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
1320 break;
1321 case XFS_DIR2_LEAFN_MAGIC:
1322 ASSERT(XFS_DIR_IS_V2(state->mp));
1323 before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
1324 break;
1325 case XFS_DA_NODE_MAGIC:
1326 before = xfs_da_node_order(old_blk->bp, new_blk->bp);
1327 break;
1328 }
1329
1330 /*
1331 * Link blocks in appropriate order.
1332 */
1333 if (before) {
1334 /*
1335 * Link new block in before existing block.
1336 */
1337 INT_SET(new_info->forw, ARCH_CONVERT, old_blk->blkno);
1338 new_info->back = old_info->back; /* INT_: direct copy */
1339 if (INT_GET(old_info->back, ARCH_CONVERT)) {
1340 error = xfs_da_read_buf(args->trans, args->dp,
1341 INT_GET(old_info->back,
1342 ARCH_CONVERT), -1, &bp,
1343 args->whichfork);
1344 if (error)
1345 return(error);
1346 ASSERT(bp != NULL);
1347 tmp_info = bp->data;
1348 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(old_info->magic, ARCH_CONVERT));
1349 ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == old_blk->blkno);
1350 INT_SET(tmp_info->forw, ARCH_CONVERT, new_blk->blkno);
1351 xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1352 xfs_da_buf_done(bp);
1353 }
1354 INT_SET(old_info->back, ARCH_CONVERT, new_blk->blkno);
1355 } else {
1356 /*
1357 * Link new block in after existing block.
1358 */
1359 new_info->forw = old_info->forw; /* INT_: direct copy */
1360 INT_SET(new_info->back, ARCH_CONVERT, old_blk->blkno);
1361 if (INT_GET(old_info->forw, ARCH_CONVERT)) {
1362 error = xfs_da_read_buf(args->trans, args->dp,
1363 INT_GET(old_info->forw, ARCH_CONVERT), -1, &bp,
1364 args->whichfork);
1365 if (error)
1366 return(error);
1367 ASSERT(bp != NULL);
1368 tmp_info = bp->data;
1369 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
1370 == INT_GET(old_info->magic, ARCH_CONVERT));
1371 ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
1372 == old_blk->blkno);
1373 INT_SET(tmp_info->back, ARCH_CONVERT, new_blk->blkno);
1374 xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1375 xfs_da_buf_done(bp);
1376 }
1377 INT_SET(old_info->forw, ARCH_CONVERT, new_blk->blkno);
1378 }
1379
1380 xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1381 xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1382 return(0);
1383}
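/*
 * Illustrative sketch (not from the original source): linking new
 * block N before existing block O, where B is O's old back sibling:
 *
 *	before:   B <-> O
 *	after:    B <-> N <-> O
 *
 * Three blocks are touched: B's forw pointer, N's forw/back pointers,
 * and O's back pointer.  The "after" case is the mirror image using
 * O's forward sibling.
 */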
1384
1385/*
1386 * Compare two intermediate nodes for "order".
1387 */
1388STATIC int
1389xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
1390{
1391 xfs_da_intnode_t *node1, *node2;
1392
1393 node1 = node1_bp->data;
1394 node2 = node2_bp->data;
1395 ASSERT((INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) &&
1396 (INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC));
1397 if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
1398 ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) <
1399 INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
1400 (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
1401 INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
1402 return(1);
1403 }
1404 return(0);
1405}
1406
1407/*
1408 * Pick up the last hash value from an intermediate node.
1409 */
1410STATIC uint
1411xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
1412{
1413 xfs_da_intnode_t *node;
1414
1415 node = bp->data;
1416 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1417 if (count)
1418 *count = INT_GET(node->hdr.count, ARCH_CONVERT);
1419 if (!node->hdr.count)
1420 return(0);
1421 return(INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
1422}
1423
1424/*
1425 * Unlink a block from a doubly linked list of blocks.
1426 */
1427int /* error */
1428xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1429 xfs_da_state_blk_t *save_blk)
1430{
1431 xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
1432 xfs_da_args_t *args;
1433 xfs_dabuf_t *bp;
1434 int error;
1435
1436 /*
1437 * Set up environment.
1438 */
1439 args = state->args;
1440 ASSERT(args != NULL);
1441 save_info = save_blk->bp->data;
1442 drop_info = drop_blk->bp->data;
1443 ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
1444 save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1445 save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1446 ASSERT(save_blk->magic == INT_GET(save_info->magic, ARCH_CONVERT));
1447 ASSERT(drop_blk->magic == INT_GET(drop_info->magic, ARCH_CONVERT));
1448 ASSERT(save_blk->magic == drop_blk->magic);
1449 ASSERT((INT_GET(save_info->forw, ARCH_CONVERT) == drop_blk->blkno) ||
1450 (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno));
1451 ASSERT((INT_GET(drop_info->forw, ARCH_CONVERT) == save_blk->blkno) ||
1452 (INT_GET(drop_info->back, ARCH_CONVERT) == save_blk->blkno));
1453
1454 /*
1455 * Unlink the leaf block from the doubly linked chain of leaves.
1456 */
1457 if (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno) {
1458 save_info->back = drop_info->back; /* INT_: direct copy */
1459 if (INT_GET(drop_info->back, ARCH_CONVERT)) {
1460 error = xfs_da_read_buf(args->trans, args->dp,
1461 INT_GET(drop_info->back,
1462 ARCH_CONVERT), -1, &bp,
1463 args->whichfork);
1464 if (error)
1465 return(error);
1466 ASSERT(bp != NULL);
1467 tmp_info = bp->data;
1468 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(save_info->magic, ARCH_CONVERT));
1469 ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == drop_blk->blkno);
1470 INT_SET(tmp_info->forw, ARCH_CONVERT, save_blk->blkno);
1471 xfs_da_log_buf(args->trans, bp, 0,
1472 sizeof(*tmp_info) - 1);
1473 xfs_da_buf_done(bp);
1474 }
1475 } else {
1476 save_info->forw = drop_info->forw; /* INT_: direct copy */
1477 if (INT_GET(drop_info->forw, ARCH_CONVERT)) {
1478 error = xfs_da_read_buf(args->trans, args->dp,
1479 INT_GET(drop_info->forw, ARCH_CONVERT), -1, &bp,
1480 args->whichfork);
1481 if (error)
1482 return(error);
1483 ASSERT(bp != NULL);
1484 tmp_info = bp->data;
1485 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
1486 == INT_GET(save_info->magic, ARCH_CONVERT));
1487 ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
1488 == drop_blk->blkno);
1489 INT_SET(tmp_info->back, ARCH_CONVERT, save_blk->blkno);
1490 xfs_da_log_buf(args->trans, bp, 0,
1491 sizeof(*tmp_info) - 1);
1492 xfs_da_buf_done(bp);
1493 }
1494 }
1495
1496 xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1497 return(0);
1498}
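/*
 * Illustrative sketch (not from the original source): dropping D from
 * the chain B <-> D <-> S leaves B <-> S.  Only the surviving sibling
 * (save_blk) and the far neighbor's link are rewritten; the dying
 * block's own pointers are simply abandoned with the block.
 */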
1499
1500/*
1501 * Move a path "forward" or "!forward" one block at the current level.
1502 *
1503 * This routine will adjust a "path" to point to the next block
1504 * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
1505 * Btree, including updating pointers to the intermediate nodes between
1506 * the new bottom and the root.
1507 */
1508int /* error */
1509xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1510 int forward, int release, int *result)
1511{
1512 xfs_da_state_blk_t *blk;
1513 xfs_da_blkinfo_t *info;
1514 xfs_da_intnode_t *node;
1515 xfs_da_args_t *args;
1516 xfs_dablk_t blkno=0;
1517 int level, error;
1518
1519 /*
1520 * Roll up the Btree looking for the first block where our
1521 * current index is not at the edge of the block. Note that
1522 * we skip the bottom layer because we want the sibling block.
1523 */
1524 args = state->args;
1525 ASSERT(args != NULL);
1526 ASSERT(path != NULL);
1527 ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1528 level = (path->active-1) - 1; /* skip bottom layer in path */
1529 for (blk = &path->blk[level]; level >= 0; blk--, level--) {
1530 ASSERT(blk->bp != NULL);
1531 node = blk->bp->data;
1532 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1533 if (forward && (blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
1534 blk->index++;
1535 blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
1536 break;
1537 } else if (!forward && (blk->index > 0)) {
1538 blk->index--;
1539 blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
1540 break;
1541 }
1542 }
1543 if (level < 0) {
1544 *result = XFS_ERROR(ENOENT); /* we're out of our tree */
1545 ASSERT(args->oknoent);
1546 return(0);
1547 }
1548
1549 /*
1550 * Roll down the edge of the subtree until we reach the
1551 * same depth we were at originally.
1552 */
1553 for (blk++, level++; level < path->active; blk++, level++) {
1554 /*
1555 * Release the old block.
1556 * (if it's dirty, trans won't actually let go)
1557 */
1558 if (release)
1559 xfs_da_brelse(args->trans, blk->bp);
1560
1561 /*
1562 * Read the next child block.
1563 */
1564 blk->blkno = blkno;
1565 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
1566 &blk->bp, args->whichfork);
1567 if (error)
1568 return(error);
1569 ASSERT(blk->bp != NULL);
1570 info = blk->bp->data;
1571 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
1572 INT_GET(info->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1573 INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
1574 blk->magic = INT_GET(info->magic, ARCH_CONVERT);
1575 if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
1576 node = (xfs_da_intnode_t *)info;
1577 blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1578 if (forward)
1579 blk->index = 0;
1580 else
1581 blk->index = INT_GET(node->hdr.count, ARCH_CONVERT)-1;
1582 blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
1583 } else {
1584 ASSERT(level == path->active-1);
1585 blk->index = 0;
1586 switch(blk->magic) {
1587#ifdef __KERNEL__
1588 case XFS_ATTR_LEAF_MAGIC:
1589 blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
1590 NULL);
1591 break;
1592#endif
1593 case XFS_DIR_LEAF_MAGIC:
1594 ASSERT(XFS_DIR_IS_V1(state->mp));
1595 blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
1596 NULL);
1597 break;
1598 case XFS_DIR2_LEAFN_MAGIC:
1599 ASSERT(XFS_DIR_IS_V2(state->mp));
1600 blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
1601 NULL);
1602 break;
1603 default:
1604 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
1605 blk->magic ==
1606 XFS_DIRX_LEAF_MAGIC(state->mp));
1607 break;
1608 }
1609 }
1610 }
1611 *result = 0;
1612 return(0);
1613}
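/*
 * Illustrative example (not from the original source): shifting
 * "forward" from the last entry of a leaf.  The first loop walks up
 * until it finds a node whose index is not already at its last entry,
 * bumps that index, then the second loop rolls back down the new
 * subtree taking the 0th entry at each level, ending at the leaf with
 * the next-higher hashvals.
 */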
1614
1615
1616/*========================================================================
1617 * Utility routines.
1618 *========================================================================*/
1619
1620/*
1621 * Implement a simple hash on a character string.
1622 * Rotate the hash value by 7 bits, then XOR each character in.
1623 * This is implemented with some source-level loop unrolling.
1624 */
1625xfs_dahash_t
1626xfs_da_hashname(uchar_t *name, int namelen)
1627{
1628 xfs_dahash_t hash;
1629
1630#ifdef SLOWVERSION
1631 /*
1632 * This is the old one-byte-at-a-time version.
1633 */
1634 for (hash = 0; namelen > 0; namelen--)
1635 hash = *name++ ^ rol32(hash, 7);
1636
1637 return(hash);
1638#else
1639 /*
1640 * Do four characters at a time as long as we can.
1641 */
1642 for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
1643 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
1644 (name[3] << 0) ^ rol32(hash, 7 * 4);
1645
1646 /*
1647 * Now do the rest of the characters.
1648 */
1649 switch (namelen) {
1650 case 3:
1651 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
1652 rol32(hash, 7 * 3);
1653 case 2:
1654 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
1655 case 1:
1656 return (name[0] << 0) ^ rol32(hash, 7 * 1);
1657 case 0:
1658 return hash;
1659 }
1660 /* NOTREACHED */
1661#endif
1662 return 0; /* keep gcc happy */
1663}
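/*
 * Worked example (illustrative, not in the original source): hashing
 * the 3-byte name "foo".  The unrolled path takes the namelen == 3
 * case directly:
 *
 *	('f' << 14) ^ ('o' << 7) ^ ('o' << 0) ^ rol32(0, 21)
 *	= 0x198000  ^ 0x3780    ^ 0x6f       ^ 0
 *	= 0x19b7ef
 *
 * The SLOWVERSION loop computes the same value one byte at a time:
 * 0 -> 0x66 -> 0x336f -> 0x19b7ef.
 */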
1664
1665/*
1666 * Add a block to the btree, mapping it at the first unused offset
1667 * in the file.  Return the new block number to the caller.
1668 */
1669int
1670xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1671{
1672 xfs_fileoff_t bno, b;
1673 xfs_bmbt_irec_t map;
1674 xfs_bmbt_irec_t *mapp;
1675 xfs_inode_t *dp;
1676 int nmap, error, w, count, c, got, i, mapi;
1677 xfs_fsize_t size;
1678 xfs_trans_t *tp;
1679 xfs_mount_t *mp;
1680
1681 dp = args->dp;
1682 mp = dp->i_mount;
1683 w = args->whichfork;
1684 tp = args->trans;
1685 /*
1686 * For new directories adjust the file offset and block count.
1687 */
1688 if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
1689 bno = mp->m_dirleafblk;
1690 count = mp->m_dirblkfsbs;
1691 } else {
1692 bno = 0;
1693 count = 1;
1694 }
1695 /*
1696 * Find a spot in the file space to put the new block.
1697 */
1698 if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
1699 return error;
1700 }
1701 if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
1702 ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
1703 /*
1704 * Try mapping it in one filesystem block.
1705 */
1706 nmap = 1;
1707 ASSERT(args->firstblock != NULL);
1708 if ((error = xfs_bmapi(tp, dp, bno, count,
1709 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
1710 XFS_BMAPI_CONTIG,
1711 args->firstblock, args->total, &map, &nmap,
1712 args->flist))) {
1713 return error;
1714 }
1715 ASSERT(nmap <= 1);
1716 if (nmap == 1) {
1717 mapp = &map;
1718 mapi = 1;
1719 }
1720 /*
1721 * If we didn't get it and the block might work if fragmented,
1722 * try without the CONTIG flag. Loop until we get it all.
1723 */
1724 else if (nmap == 0 && count > 1) {
1725 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
1726 for (b = bno, mapi = 0; b < bno + count; ) {
1727 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
1728 c = (int)(bno + count - b);
1729 if ((error = xfs_bmapi(tp, dp, b, c,
1730 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
1731 XFS_BMAPI_METADATA,
1732 args->firstblock, args->total,
1733 &mapp[mapi], &nmap, args->flist))) {
1734 kmem_free(mapp, sizeof(*mapp) * count);
1735 return error;
1736 }
1737 if (nmap < 1)
1738 break;
1739 mapi += nmap;
1740 b = mapp[mapi - 1].br_startoff +
1741 mapp[mapi - 1].br_blockcount;
1742 }
1743 } else {
1744 mapi = 0;
1745 mapp = NULL;
1746 }
1747 /*
1748 * Count the blocks we got, make sure it matches the total.
1749 */
1750 for (i = 0, got = 0; i < mapi; i++)
1751 got += mapp[i].br_blockcount;
1752 if (got != count || mapp[0].br_startoff != bno ||
1753 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
1754 bno + count) {
1755 if (mapp != &map)
1756 kmem_free(mapp, sizeof(*mapp) * count);
1757 return XFS_ERROR(ENOSPC);
1758 }
1759 if (mapp != &map)
1760 kmem_free(mapp, sizeof(*mapp) * count);
1761 *new_blkno = (xfs_dablk_t)bno;
1762 /*
1763 * For version 1 directories, adjust the file size if it changed.
1764 */
1765 if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
1766 ASSERT(mapi == 1);
1767 if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
1768 return error;
1769 size = XFS_FSB_TO_B(mp, bno);
1770 if (size != dp->i_d.di_size) {
1771 dp->i_d.di_size = size;
1772 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1773 }
1774 }
1775 return 0;
1776}
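/*
 * Illustrative note (not from the original source): for a v2
 * directory with m_dirblkfsbs == 4, the function first asks xfs_bmapi
 * for one contiguous 4-fsb extent (XFS_BMAPI_CONTIG).  If that fails
 * with nmap == 0, it retries piecewise without CONTIG -- e.g. extents
 * of 2, 1 and 1 fsbs -- and the final coverage check only demands
 * that the pieces tile [bno, bno + 4) exactly, not that they are
 * physically contiguous.
 */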
1777
1778/*
1779 * Ick. We need to always be able to remove a btree block, even
1780 * if there's no space reservation because the filesystem is full.
1781 * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
1782 * It swaps the target block with the last block in the file. The
1783 * last block in the file can always be removed, since unmapping it
1784 * cannot cause a bmap btree split.
1785 */
1786STATIC int
1787xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
1788 xfs_dabuf_t **dead_bufp)
1789{
1790 xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
1791 xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
1792 xfs_fileoff_t lastoff;
1793 xfs_inode_t *ip;
1794 xfs_trans_t *tp;
1795 xfs_mount_t *mp;
1796 int error, w, entno, level, dead_level;
1797 xfs_da_blkinfo_t *dead_info, *sib_info;
1798 xfs_da_intnode_t *par_node, *dead_node;
1799 xfs_dir_leafblock_t *dead_leaf;
1800 xfs_dir2_leaf_t *dead_leaf2;
1801 xfs_dahash_t dead_hash;
1802
1803 dead_buf = *dead_bufp;
1804 dead_blkno = *dead_blknop;
1805 tp = args->trans;
1806 ip = args->dp;
1807 w = args->whichfork;
1808 ASSERT(w == XFS_DATA_FORK);
1809 mp = ip->i_mount;
1810 if (XFS_DIR_IS_V2(mp)) {
1811 lastoff = mp->m_dirfreeblk;
1812 error = xfs_bmap_last_before(tp, ip, &lastoff, w);
1813 } else
1814 error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
1815 if (error)
1816 return error;
1817 if (unlikely(lastoff == 0)) {
1818 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
1819 mp);
1820 return XFS_ERROR(EFSCORRUPTED);
1821 }
1822 /*
1823 * Read the last block in the btree space.
1824 */
1825 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1826 if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
1827 return error;
1828 /*
1829 * Copy the last block into the dead buffer and log it.
1830 */
1831 memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize);
1832 xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
1833 dead_info = dead_buf->data;
1834 /*
1835 * Get values from the moved block.
1836 */
1837 if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
1838 ASSERT(XFS_DIR_IS_V1(mp));
1839 dead_leaf = (xfs_dir_leafblock_t *)dead_info;
1840 dead_level = 0;
1841 dead_hash =
1842 INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1843 } else if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
1844 ASSERT(XFS_DIR_IS_V2(mp));
1845 dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
1846 dead_level = 0;
1847 dead_hash = INT_GET(dead_leaf2->ents[INT_GET(dead_leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1848 } else {
1849 ASSERT(INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1850 dead_node = (xfs_da_intnode_t *)dead_info;
1851 dead_level = INT_GET(dead_node->hdr.level, ARCH_CONVERT);
1852 dead_hash = INT_GET(dead_node->btree[INT_GET(dead_node->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1853 }
1854 sib_buf = par_buf = NULL;
1855 /*
1856 * If the moved block has a left sibling, fix up the pointers.
1857 */
1858 if ((sib_blkno = INT_GET(dead_info->back, ARCH_CONVERT))) {
1859 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
1860 goto done;
1861 sib_info = sib_buf->data;
1862 if (unlikely(
1863 INT_GET(sib_info->forw, ARCH_CONVERT) != last_blkno ||
1864 INT_GET(sib_info->magic, ARCH_CONVERT) != INT_GET(dead_info->magic, ARCH_CONVERT))) {
1865 XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
1866 XFS_ERRLEVEL_LOW, mp);
1867 error = XFS_ERROR(EFSCORRUPTED);
1868 goto done;
1869 }
1870 INT_SET(sib_info->forw, ARCH_CONVERT, dead_blkno);
1871 xfs_da_log_buf(tp, sib_buf,
1872 XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
1873 sizeof(sib_info->forw)));
1874 xfs_da_buf_done(sib_buf);
1875 sib_buf = NULL;
1876 }
1877 /*
1878 * If the moved block has a right sibling, fix up the pointers.
1879 */
1880 if ((sib_blkno = INT_GET(dead_info->forw, ARCH_CONVERT))) {
1881 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
1882 goto done;
1883 sib_info = sib_buf->data;
1884 if (unlikely(
1885 INT_GET(sib_info->back, ARCH_CONVERT) != last_blkno
1886 || INT_GET(sib_info->magic, ARCH_CONVERT)
1887 != INT_GET(dead_info->magic, ARCH_CONVERT))) {
1888 XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
1889 XFS_ERRLEVEL_LOW, mp);
1890 error = XFS_ERROR(EFSCORRUPTED);
1891 goto done;
1892 }
1893 INT_SET(sib_info->back, ARCH_CONVERT, dead_blkno);
1894 xfs_da_log_buf(tp, sib_buf,
1895 XFS_DA_LOGRANGE(sib_info, &sib_info->back,
1896 sizeof(sib_info->back)));
1897 xfs_da_buf_done(sib_buf);
1898 sib_buf = NULL;
1899 }
1900 par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
1901 level = -1;
1902 /*
1903 * Walk down the tree looking for the parent of the moved block.
1904 */
1905 for (;;) {
1906 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
1907 goto done;
1908 par_node = par_buf->data;
1909 if (unlikely(
1910 INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC ||
1911 (level >= 0 && level != INT_GET(par_node->hdr.level, ARCH_CONVERT) + 1))) {
1912 XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
1913 XFS_ERRLEVEL_LOW, mp);
1914 error = XFS_ERROR(EFSCORRUPTED);
1915 goto done;
1916 }
1917 level = INT_GET(par_node->hdr.level, ARCH_CONVERT);
1918 for (entno = 0;
1919 entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
1920 INT_GET(par_node->btree[entno].hashval, ARCH_CONVERT) < dead_hash;
1921 entno++)
1922 continue;
1923 if (unlikely(entno == INT_GET(par_node->hdr.count, ARCH_CONVERT))) {
1924 XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
1925 XFS_ERRLEVEL_LOW, mp);
1926 error = XFS_ERROR(EFSCORRUPTED);
1927 goto done;
1928 }
1929 par_blkno = INT_GET(par_node->btree[entno].before, ARCH_CONVERT);
1930 if (level == dead_level + 1)
1931 break;
1932 xfs_da_brelse(tp, par_buf);
1933 par_buf = NULL;
1934 }
1935 /*
1936 * We're in the right parent block.
1937 * Look for the right entry.
1938 */
1939 for (;;) {
1940 for (;
1941 entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
1942 INT_GET(par_node->btree[entno].before, ARCH_CONVERT) != last_blkno;
1943 entno++)
1944 continue;
1945 if (entno < INT_GET(par_node->hdr.count, ARCH_CONVERT))
1946 break;
1947 par_blkno = INT_GET(par_node->hdr.info.forw, ARCH_CONVERT);
1948 xfs_da_brelse(tp, par_buf);
1949 par_buf = NULL;
1950 if (unlikely(par_blkno == 0)) {
1951 XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
1952 XFS_ERRLEVEL_LOW, mp);
1953 error = XFS_ERROR(EFSCORRUPTED);
1954 goto done;
1955 }
1956 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
1957 goto done;
1958 par_node = par_buf->data;
1959 if (unlikely(
1960 INT_GET(par_node->hdr.level, ARCH_CONVERT) != level ||
1961 INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)) {
1962 XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
1963 XFS_ERRLEVEL_LOW, mp);
1964 error = XFS_ERROR(EFSCORRUPTED);
1965 goto done;
1966 }
1967 entno = 0;
1968 }
1969 /*
1970 * Update the parent entry pointing to the moved block.
1971 */
1972 INT_SET(par_node->btree[entno].before, ARCH_CONVERT, dead_blkno);
1973 xfs_da_log_buf(tp, par_buf,
1974 XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
1975 sizeof(par_node->btree[entno].before)));
1976 xfs_da_buf_done(par_buf);
1977 xfs_da_buf_done(dead_buf);
1978 *dead_blknop = last_blkno;
1979 *dead_bufp = last_buf;
1980 return 0;
1981done:
1982 if (par_buf)
1983 xfs_da_brelse(tp, par_buf);
1984 if (sib_buf)
1985 xfs_da_brelse(tp, sib_buf);
1986 xfs_da_brelse(tp, last_buf);
1987 return error;
1988}
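/*
 * Illustrative sketch (not from the original source) of the swap trick
 * above, for a directory whose btree space ends at block L and where
 * block D must die:
 *
 *	1. copy L's contents over D and log all of D;
 *	2. repoint L's left/right siblings at D;
 *	3. walk down from the root to the parent entry whose "before"
 *	   pointer is L, and repoint it at D;
 *	4. hand L back to the caller as the block to bunmapi, which now
 *	   sits at the end of the file and can always be unmapped.
 */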
1989
1990/*
1991 * Remove a btree block from a directory or attribute.
1992 */
1993int
1994xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1995 xfs_dabuf_t *dead_buf)
1996{
1997 xfs_inode_t *dp;
1998 int done, error, w, count;
1999 xfs_fileoff_t bno;
2000 xfs_fsize_t size;
2001 xfs_trans_t *tp;
2002 xfs_mount_t *mp;
2003
2004 dp = args->dp;
2005 w = args->whichfork;
2006 tp = args->trans;
2007 mp = dp->i_mount;
2008 if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
2009 count = mp->m_dirblkfsbs;
2010 else
2011 count = 1;
2012 for (;;) {
2013 /*
2014 * Remove extents. If we get ENOSPC for a dir we have to move
2015 * the last block to the place we want to kill.
2016 */
2017 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
2018 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
2019 0, args->firstblock, args->flist,
2020 &done)) == ENOSPC) {
2021 if (w != XFS_DATA_FORK)
2022 goto done;
2023 if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
2024 &dead_buf)))
2025 goto done;
2026 } else if (error)
2027 goto done;
2028 else
2029 break;
2030 }
2031 ASSERT(done);
2032 xfs_da_binval(tp, dead_buf);
2033 /*
2034 * Adjust the directory size for version 1.
2035 */
2036 if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
2037 if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
2038 return error;
2039 size = XFS_FSB_TO_B(dp->i_mount, bno);
2040 if (size != dp->i_d.di_size) {
2041 dp->i_d.di_size = size;
2042 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2043 }
2044 }
2045 return 0;
2046done:
2047 xfs_da_binval(tp, dead_buf);
2048 return error;
2049}
2050
2051/*
2052 * See if the mapping(s) for this btree block are valid, i.e.
2053 * don't contain holes, are logically contiguous, and cover the whole range.
2054 */
2055STATIC int
2056xfs_da_map_covers_blocks(
2057 int nmap,
2058 xfs_bmbt_irec_t *mapp,
2059 xfs_dablk_t bno,
2060 int count)
2061{
2062 int i;
2063 xfs_fileoff_t off;
2064
2065 for (i = 0, off = bno; i < nmap; i++) {
2066 if (mapp[i].br_startblock == HOLESTARTBLOCK ||
2067 mapp[i].br_startblock == DELAYSTARTBLOCK) {
2068 return 0;
2069 }
2070 if (off != mapp[i].br_startoff) {
2071 return 0;
2072 }
2073 off += mapp[i].br_blockcount;
2074 }
2075 return off == bno + count;
2076}
2077
2078/*
2079 * Make a dabuf.
2080 * Used for get_buf (caller 0), read_buf (1), read_bufr (2), and reada_buf (3).
2081 */
2082STATIC int
2083xfs_da_do_buf(
2084 xfs_trans_t *trans,
2085 xfs_inode_t *dp,
2086 xfs_dablk_t bno,
2087 xfs_daddr_t *mappedbnop,
2088 xfs_dabuf_t **bpp,
2089 int whichfork,
2090 int caller,
2091 inst_t *ra)
2092{
2093 xfs_buf_t *bp = NULL;
2094 xfs_buf_t **bplist;
2095 int error=0;
2096 int i;
2097 xfs_bmbt_irec_t map;
2098 xfs_bmbt_irec_t *mapp;
2099 xfs_daddr_t mappedbno;
2100 xfs_mount_t *mp;
2101 int nbplist=0;
2102 int nfsb;
2103 int nmap;
2104 xfs_dabuf_t *rbp;
2105
2106 mp = dp->i_mount;
2107 if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
2108 nfsb = mp->m_dirblkfsbs;
2109 else
2110 nfsb = 1;
2111 mappedbno = *mappedbnop;
2112 /*
2113 * Caller doesn't have a mapping. -2 means don't complain
2114 * if we land in a hole.
2115 */
2116 if (mappedbno == -1 || mappedbno == -2) {
2117 /*
2118 * Optimize the one-block case.
2119 */
2120 if (nfsb == 1) {
2121 xfs_fsblock_t fsb;
2122
2123 if ((error =
2124 xfs_bmapi_single(trans, dp, whichfork, &fsb,
2125 (xfs_fileoff_t)bno))) {
2126 return error;
2127 }
2128 mapp = &map;
2129 if (fsb == NULLFSBLOCK) {
2130 nmap = 0;
2131 } else {
2132 map.br_startblock = fsb;
2133 map.br_startoff = (xfs_fileoff_t)bno;
2134 map.br_blockcount = 1;
2135 nmap = 1;
2136 }
2137 } else {
2138 mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
2139 nmap = nfsb;
2140 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
2141 nfsb,
2142 XFS_BMAPI_METADATA |
2143 XFS_BMAPI_AFLAG(whichfork),
2144 NULL, 0, mapp, &nmap, NULL)))
2145 goto exit0;
2146 }
2147 } else {
2148 map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2149 map.br_startoff = (xfs_fileoff_t)bno;
2150 map.br_blockcount = nfsb;
2151 mapp = &map;
2152 nmap = 1;
2153 }
2154 if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
2155 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
2156 if (unlikely(error == EFSCORRUPTED)) {
2157 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2158 int i;
2159 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n",
2160 (long long)bno);
2161 cmn_err(CE_ALERT, "dir: inode %lld\n",
2162 (long long)dp->i_ino);
2163 for (i = 0; i < nmap; i++) {
2164 cmn_err(CE_ALERT,
2165 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n",
2166 i,
2167 (long long)mapp[i].br_startoff,
2168 (long long)mapp[i].br_startblock,
2169 (long long)mapp[i].br_blockcount,
2170 mapp[i].br_state);
2171 }
2172 }
2173 XFS_ERROR_REPORT("xfs_da_do_buf(1)",
2174 XFS_ERRLEVEL_LOW, mp);
2175 }
2176 goto exit0;
2177 }
2178 if (caller != 3 && nmap > 1) {
2179 bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
2180 nbplist = 0;
2181 } else
2182 bplist = NULL;
2183 /*
2184 * Turn the mapping(s) into buffer(s).
2185 */
2186 for (i = 0; i < nmap; i++) {
2187 int nmapped;
2188
2189 mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
2190 if (i == 0)
2191 *mappedbnop = mappedbno;
2192 nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
2193 switch (caller) {
2194 case 0:
2195 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
2196 mappedbno, nmapped, 0);
2197 error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
2198 break;
2199 case 1:
2200#ifndef __KERNEL__
2201 case 2:
2202#endif
2203 bp = NULL;
2204 error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
2205 mappedbno, nmapped, 0, &bp);
2206 break;
2207#ifdef __KERNEL__
2208 case 3:
2209 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
2210 error = 0;
2211 bp = NULL;
2212 break;
2213#endif
2214 }
2215 if (error) {
2216 if (bp)
2217 xfs_trans_brelse(trans, bp);
2218 goto exit1;
2219 }
2220 if (!bp)
2221 continue;
2222 if (caller == 1) {
2223 if (whichfork == XFS_ATTR_FORK) {
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
2225 XFS_ATTR_BTREE_REF);
2226 } else {
2227 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
2228 XFS_DIR_BTREE_REF);
2229 }
2230 }
2231 if (bplist) {
2232 bplist[nbplist++] = bp;
2233 }
2234 }
2235 /*
2236 * Build a dabuf structure.
2237 */
2238 if (bplist) {
2239 rbp = xfs_da_buf_make(nbplist, bplist, ra);
2240 } else if (bp)
2241 rbp = xfs_da_buf_make(1, &bp, ra);
2242 else
2243 rbp = NULL;
2244 /*
2245 * For read_buf, check the magic number.
2246 */
2247 if (caller == 1) {
2248 xfs_dir2_data_t *data;
2249 xfs_dir2_free_t *free;
2250 xfs_da_blkinfo_t *info;
2251 uint magic, magic1;
2252
2253 info = rbp->data;
2254 data = rbp->data;
2255 free = rbp->data;
2256 magic = INT_GET(info->magic, ARCH_CONVERT);
2257 magic1 = INT_GET(data->hdr.magic, ARCH_CONVERT);
2258 if (unlikely(
2259 XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
2260 (magic != XFS_DIR_LEAF_MAGIC) &&
2261 (magic != XFS_ATTR_LEAF_MAGIC) &&
2262 (magic != XFS_DIR2_LEAF1_MAGIC) &&
2263 (magic != XFS_DIR2_LEAFN_MAGIC) &&
2264 (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
2265 (magic1 != XFS_DIR2_DATA_MAGIC) &&
2266 (INT_GET(free->hdr.magic, ARCH_CONVERT) != XFS_DIR2_FREE_MAGIC),
2267 mp, XFS_ERRTAG_DA_READ_BUF,
2268 XFS_RANDOM_DA_READ_BUF))) {
2269 xfs_buftrace("DA READ ERROR", rbp->bps[0]);
2270 XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
2271 XFS_ERRLEVEL_LOW, mp, info);
2272 error = XFS_ERROR(EFSCORRUPTED);
2273 xfs_da_brelse(trans, rbp);
2274 nbplist = 0;
2275 goto exit1;
2276 }
2277 }
2278 if (bplist) {
2279 kmem_free(bplist, sizeof(*bplist) * nmap);
2280 }
2281 if (mapp != &map) {
2282 kmem_free(mapp, sizeof(*mapp) * nfsb);
2283 }
2284 if (bpp)
2285 *bpp = rbp;
2286 return 0;
2287exit1:
2288 if (bplist) {
2289 for (i = 0; i < nbplist; i++)
2290 xfs_trans_brelse(trans, bplist[i]);
2291 kmem_free(bplist, sizeof(*bplist) * nmap);
2292 }
2293exit0:
2294 if (mapp != &map)
2295 kmem_free(mapp, sizeof(*mapp) * nfsb);
2296 if (bpp)
2297 *bpp = NULL;
2298 return error;
2299}
2300
2301/*
2302 * Get a buffer for the dir/attr block.
2303 */
2304int
2305xfs_da_get_buf(
2306 xfs_trans_t *trans,
2307 xfs_inode_t *dp,
2308 xfs_dablk_t bno,
2309 xfs_daddr_t mappedbno,
2310 xfs_dabuf_t **bpp,
2311 int whichfork)
2312{
2313 return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
2314 (inst_t *)__return_address);
2315}
2316
2317/*
2318 * Get a buffer for the dir/attr block, fill in the contents.
2319 */
2320int
2321xfs_da_read_buf(
2322 xfs_trans_t *trans,
2323 xfs_inode_t *dp,
2324 xfs_dablk_t bno,
2325 xfs_daddr_t mappedbno,
2326 xfs_dabuf_t **bpp,
2327 int whichfork)
2328{
2329 return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
2330 (inst_t *)__return_address);
2331}
2332
2333/*
2334 * Readahead the dir/attr block.
2335 */
2336xfs_daddr_t
2337xfs_da_reada_buf(
2338 xfs_trans_t *trans,
2339 xfs_inode_t *dp,
2340 xfs_dablk_t bno,
2341 int whichfork)
2342{
2343 xfs_daddr_t rval;
2344
2345 rval = -1;
2346 if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
2347 (inst_t *)__return_address))
2348 return -1;
2349 else
2350 return rval;
2351}
2352
2353/*
2354 * Calculate the number of bits needed to hold i different values.
2355 */
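/*
 * For example, holding 5 distinct values takes 3 bits (2^3 = 8 >= 5),
 * while i == 0 and i == 1 both yield 0.
 */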
2356uint
2357xfs_da_log2_roundup(uint i)
2358{
2359 uint rval;
2360
2361 for (rval = 0; rval < NBBY * sizeof(i); rval++) {
2362 if ((1 << rval) >= i)
2363 break;
2364 }
2365 return(rval);
2366}
2367
2368kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
2369kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
2370
2371/*
2372 * Allocate a dir-state structure.
2373 * We don't put them on the stack since they're large.
2374 */
2375xfs_da_state_t *
2376xfs_da_state_alloc(void)
2377{
2378 return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
2379}
2380
2381/*
2382 * Kill the altpath contents of a da-state structure.
2383 */
2384void
2385xfs_da_state_kill_altpath(xfs_da_state_t *state)
2386{
2387 int i;
2388
2389 for (i = 0; i < state->altpath.active; i++) {
2390 if (state->altpath.blk[i].bp) {
2391 if (state->altpath.blk[i].bp != state->path.blk[i].bp)
2392 xfs_da_buf_done(state->altpath.blk[i].bp);
2393 state->altpath.blk[i].bp = NULL;
2394 }
2395 }
2396 state->altpath.active = 0;
2397}
2398
2399/*
2400 * Free a da-state structure.
2401 */
2402void
2403xfs_da_state_free(xfs_da_state_t *state)
2404{
2405 int i;
2406
2407 xfs_da_state_kill_altpath(state);
2408 for (i = 0; i < state->path.active; i++) {
2409 if (state->path.blk[i].bp)
2410 xfs_da_buf_done(state->path.blk[i].bp);
2411 }
2412 if (state->extravalid && state->extrablk.bp)
2413 xfs_da_buf_done(state->extrablk.bp);
2414#ifdef DEBUG
2415 memset((char *)state, 0, sizeof(*state));
2416#endif /* DEBUG */
2417 kmem_zone_free(xfs_da_state_zone, state);
2418}
2419
2420#ifdef XFS_DABUF_DEBUG
2421xfs_dabuf_t *xfs_dabuf_global_list;
2422lock_t xfs_dabuf_global_lock;
2423#endif
2424
2425/*
2426 * Create a dabuf.
2427 */
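/*
 * With a single underlying buffer the dabuf's data pointer aliases the
 * buffer memory directly; with several buffers the contents are copied
 * into one contiguous allocation so callers can treat a logical block
 * as flat memory, and xfs_da_buf_clean() copies dirty data back.
 */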
2428/* ARGSUSED */
2429STATIC xfs_dabuf_t *
2430xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
2431{
2432 xfs_buf_t *bp;
2433 xfs_dabuf_t *dabuf;
2434 int i;
2435 int off;
2436
2437 if (nbuf == 1)
2438 dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
2439 else
2440 dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
2441 dabuf->dirty = 0;
2442#ifdef XFS_DABUF_DEBUG
2443 dabuf->ra = ra;
2444 dabuf->target = XFS_BUF_TARGET(bps[0]);
2445 dabuf->blkno = XFS_BUF_ADDR(bps[0]);
2446#endif
2447 if (nbuf == 1) {
2448 dabuf->nbuf = 1;
2449 bp = bps[0];
2450 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
2451 dabuf->data = XFS_BUF_PTR(bp);
2452 dabuf->bps[0] = bp;
2453 } else {
2454 dabuf->nbuf = nbuf;
2455 for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
2456 dabuf->bps[i] = bp = bps[i];
2457 dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
2458 }
2459 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
2460 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
2461 bp = bps[i];
2462 memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp),
2463 XFS_BUF_COUNT(bp));
2464 }
2465 }
2466#ifdef XFS_DABUF_DEBUG
2467 {
2468 SPLDECL(s);
2469 xfs_dabuf_t *p;
2470
2471 s = mutex_spinlock(&xfs_dabuf_global_lock);
2472 for (p = xfs_dabuf_global_list; p; p = p->next) {
2473 ASSERT(p->blkno != dabuf->blkno ||
2474 p->target != dabuf->target);
2475 }
2476 dabuf->prev = NULL;
2477 if (xfs_dabuf_global_list)
2478 xfs_dabuf_global_list->prev = dabuf;
2479 dabuf->next = xfs_dabuf_global_list;
2480 xfs_dabuf_global_list = dabuf;
2481 mutex_spinunlock(&xfs_dabuf_global_lock, s);
2482 }
2483#endif
2484 return dabuf;
2485}
2486
2487/*
2488 * Un-dirty a dabuf.
2489 */
2490STATIC void
2491xfs_da_buf_clean(xfs_dabuf_t *dabuf)
2492{
2493 xfs_buf_t *bp;
2494 int i;
2495 int off;
2496
2497 if (dabuf->dirty) {
2498 ASSERT(dabuf->nbuf > 1);
2499 dabuf->dirty = 0;
2500 for (i = off = 0; i < dabuf->nbuf;
2501 i++, off += XFS_BUF_COUNT(bp)) {
2502 bp = dabuf->bps[i];
2503 memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
2504 XFS_BUF_COUNT(bp));
2505 }
2506 }
2507}
2508
2509/*
2510 * Release a dabuf.
2511 */
2512void
2513xfs_da_buf_done(xfs_dabuf_t *dabuf)
2514{
2515 ASSERT(dabuf);
2516 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2517 if (dabuf->dirty)
2518 xfs_da_buf_clean(dabuf);
2519 if (dabuf->nbuf > 1)
2520 kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
2521#ifdef XFS_DABUF_DEBUG
2522 {
2523 SPLDECL(s);
2524
2525 s = mutex_spinlock(&xfs_dabuf_global_lock);
2526 if (dabuf->prev)
2527 dabuf->prev->next = dabuf->next;
2528 else
2529 xfs_dabuf_global_list = dabuf->next;
2530 if (dabuf->next)
2531 dabuf->next->prev = dabuf->prev;
2532 mutex_spinunlock(&xfs_dabuf_global_lock, s);
2533 }
2534 memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf));
2535#endif
2536 if (dabuf->nbuf == 1)
2537 kmem_zone_free(xfs_dabuf_zone, dabuf);
2538 else
2539 kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
2540}
2541
2542/*
2543 * Log transaction from a dabuf.
2544 */
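/*
 * For a multi-buffer dabuf the [first, last] byte range is clipped
 * against each underlying buffer and logged relative to that buffer's
 * own starting offset.
 */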
2545void
2546xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
2547{
2548 xfs_buf_t *bp;
2549 uint f;
2550 int i;
2551 uint l;
2552 int off;
2553
2554 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2555 if (dabuf->nbuf == 1) {
2556 ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
2557 xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
2558 return;
2559 }
2560 dabuf->dirty = 1;
2561 ASSERT(first <= last);
2562 for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
2563 bp = dabuf->bps[i];
2564 f = off;
2565 l = f + XFS_BUF_COUNT(bp) - 1;
2566 if (f < first)
2567 f = first;
2568 if (l > last)
2569 l = last;
2570 if (f <= l)
2571 xfs_trans_log_buf(tp, bp, f - off, l - off);
2572		/*
2573		 * B_DONE is set by xfs_trans_log_buf.
2574		 * If we don't set it on a new buffer (a get, not a read),
2575		 * and we never put anything in the buffer, it won't be set;
2576		 * at commit it is released into the cache, and a later
2577		 * read will fail.
2578		 */
2579 else if (!(XFS_BUF_ISDONE(bp)))
2580 XFS_BUF_DONE(bp);
2581 }
2582 ASSERT(last < off);
2583}
2584
2585/*
2586 * Release dabuf from a transaction.
2587 * Have to free up the dabuf before the buffers are released,
2588 * since the synchronization on the dabuf is really the lock on the buffer.
2589 */
2590void
2591xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
2592{
2593 xfs_buf_t *bp;
2594 xfs_buf_t **bplist;
2595 int i;
2596 int nbuf;
2597
2598 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2599 if ((nbuf = dabuf->nbuf) == 1) {
2600 bplist = &bp;
2601 bp = dabuf->bps[0];
2602 } else {
2603 bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
2604 memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
2605 }
2606 xfs_da_buf_done(dabuf);
2607 for (i = 0; i < nbuf; i++)
2608 xfs_trans_brelse(tp, bplist[i]);
2609 if (bplist != &bp)
2610 kmem_free(bplist, nbuf * sizeof(*bplist));
2611}
2612
2613/*
2614 * Invalidate dabuf from a transaction.
2615 */
2616void
2617xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
2618{
2619 xfs_buf_t *bp;
2620 xfs_buf_t **bplist;
2621 int i;
2622 int nbuf;
2623
2624 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2625 if ((nbuf = dabuf->nbuf) == 1) {
2626 bplist = &bp;
2627 bp = dabuf->bps[0];
2628 } else {
2629 bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
2630 memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
2631 }
2632 xfs_da_buf_done(dabuf);
2633 for (i = 0; i < nbuf; i++)
2634 xfs_trans_binval(tp, bplist[i]);
2635 if (bplist != &bp)
2636 kmem_free(bplist, nbuf * sizeof(*bplist));
2637}
2638
2639/*
2640 * Get the first daddr from a dabuf.
2641 */
2642xfs_daddr_t
2643xfs_da_blkno(xfs_dabuf_t *dabuf)
2644{
2645 ASSERT(dabuf->nbuf);
2646 ASSERT(dabuf->data);
2647 return XFS_BUF_ADDR(dabuf->bps[0]);
2648}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
new file mode 100644
index 000000000000..9fc699d96995
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.h
@@ -0,0 +1,335 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DA_BTREE_H__
33#define __XFS_DA_BTREE_H__
34
35struct xfs_buf;
36struct xfs_bmap_free;
37struct xfs_inode;
38struct xfs_mount;
39struct xfs_trans;
40struct zone;
41
42/*========================================================================
43 * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
44 *========================================================================*/
45
46/*
47 * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
48 *
49 * It is used to manage a doubly linked list of all blocks at the same
50 * level in the Btree, and to identify which type of block this is.
51 */
52#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
53#define XFS_DIR_LEAF_MAGIC 0xfeeb /* magic number: directory leaf blks */
54#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
55#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
56#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
57
58#define XFS_DIRX_LEAF_MAGIC(mp) \
59 (XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
60
61typedef struct xfs_da_blkinfo {
62	xfs_dablk_t forw;			/* following block in list */
63	xfs_dablk_t back;			/* previous block in list */
64 __uint16_t magic; /* validity check on block */
65 __uint16_t pad; /* unused */
66} xfs_da_blkinfo_t;
67
68/*
69 * This is the structure of the root and intermediate nodes in the Btree.
70 * The leaf nodes are defined above.
71 *
72 * Entries are not packed.
73 *
74 * Since we have duplicate keys, use a binary search but always follow
75 * all matches in the block, not just the first match found.
76 */
77#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
78
79typedef struct xfs_da_intnode {
80 struct xfs_da_node_hdr { /* constant-structure header block */
81 xfs_da_blkinfo_t info; /* block type, links, etc. */
82 __uint16_t count; /* count of active entries */
83 __uint16_t level; /* level above leaves (leaf == 0) */
84 } hdr;
85 struct xfs_da_node_entry {
86 xfs_dahash_t hashval; /* hash value for this descendant */
87 xfs_dablk_t before; /* Btree block before this key */
88 } btree[1]; /* variable sized array of keys */
89} xfs_da_intnode_t;
90typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
91typedef struct xfs_da_node_entry xfs_da_node_entry_t;
92
93#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
94
95/*
96 * Macros used by directory code to interface to the filesystem.
97 */
98#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBSIZE)
99int xfs_lbsize(struct xfs_mount *mp);
100#define XFS_LBSIZE(mp) xfs_lbsize(mp)
101#else
102#define XFS_LBSIZE(mp) ((mp)->m_sb.sb_blocksize)
103#endif
104#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBLOG)
105int xfs_lblog(struct xfs_mount *mp);
106#define XFS_LBLOG(mp) xfs_lblog(mp)
107#else
108#define XFS_LBLOG(mp) ((mp)->m_sb.sb_blocklog)
109#endif
110
111/*
112 * Macros used by directory code to interface to the kernel
113 */
114
115/*
116 * Macros used to manipulate directory off_t's
117 */
118#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_BNOENTRY)
119__uint32_t xfs_da_make_bnoentry(struct xfs_mount *mp, xfs_dablk_t bno,
120 int entry);
121#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
122 xfs_da_make_bnoentry(mp,bno,entry)
123#else
124#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
125 (((bno) << (mp)->m_dircook_elog) | (entry))
126#endif
127#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_COOKIE)
128xfs_off_t xfs_da_make_cookie(struct xfs_mount *mp, xfs_dablk_t bno, int entry,
129 xfs_dahash_t hash);
130#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
131 xfs_da_make_cookie(mp,bno,entry,hash)
132#else
133#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
134 (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
135#endif
136#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_HASH)
137xfs_dahash_t xfs_da_cookie_hash(struct xfs_mount *mp, xfs_off_t cookie);
138#define XFS_DA_COOKIE_HASH(mp,cookie) xfs_da_cookie_hash(mp,cookie)
139#else
140#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)(cookie))
141#endif
142#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_BNO)
143xfs_dablk_t xfs_da_cookie_bno(struct xfs_mount *mp, xfs_off_t cookie);
144#define XFS_DA_COOKIE_BNO(mp,cookie) xfs_da_cookie_bno(mp,cookie)
145#else
146#define XFS_DA_COOKIE_BNO(mp,cookie) \
147 (((xfs_off_t)(cookie) >> 31) == -1LL ? \
148 (xfs_dablk_t)0 : \
149 (xfs_dablk_t)((xfs_off_t)(cookie) >> ((mp)->m_dircook_elog + 32)))
150#endif
151#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_ENTRY)
152int xfs_da_cookie_entry(struct xfs_mount *mp, xfs_off_t cookie);
153#define XFS_DA_COOKIE_ENTRY(mp,cookie) xfs_da_cookie_entry(mp,cookie)
154#else
155#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
156 (((xfs_off_t)(cookie) >> 31) == -1LL ? \
157 (xfs_dablk_t)0 : \
158 (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
159 ((1 << (mp)->m_dircook_elog) - 1)))
160#endif
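/*
 * A directory cookie thus packs ((bno << m_dircook_elog) | entry) into
 * its high 32 bits and the name hash into the low 32 bits.
 */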
161
162
163/*========================================================================
164 * Btree searching and modification structure definitions.
165 *========================================================================*/
166
167/*
168 * Structure to ease passing around component names.
169 */
170typedef struct xfs_da_args {
171	uchar_t		*name;		/* string (may not be NULL terminated) */
172	int		namelen;	/* length of string (may exclude NULL) */
173	uchar_t		*value;		/* set of bytes (may contain NULLs) */
174 int valuelen; /* length of value */
175 int flags; /* argument flags (eg: ATTR_NOCREATE) */
176 xfs_dahash_t hashval; /* hash value of name */
177 xfs_ino_t inumber; /* input/output inode number */
178 struct xfs_inode *dp; /* directory inode to manipulate */
179 xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */
180 struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */
181 struct xfs_trans *trans; /* current trans (changes over time) */
182 xfs_extlen_t total; /* total blocks needed, for 1st bmap */
183 int whichfork; /* data or attribute fork */
184 xfs_dablk_t blkno; /* blkno of attr leaf of interest */
185 int index; /* index of attr of interest in blk */
186 xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
187 int rmtblkcnt; /* remote attr value block count */
188 xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
189 int index2; /* index of 2nd attr in blk */
190 xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
191 int rmtblkcnt2; /* remote attr value block count */
192 unsigned char justcheck; /* T/F: check for ok with no space */
193 unsigned char rename; /* T/F: this is an atomic rename op */
194 unsigned char addname; /* T/F: this is an add operation */
195 unsigned char oknoent; /* T/F: ok to return ENOENT, else die */
196} xfs_da_args_t;
197
198/*
199 * Structure to describe buffer(s) for a block.
200 * This is needed in the directory version 2 format case, when
201 * multiple non-contiguous fsblocks might be needed to cover one
202 * logical directory block.
203 * If the buffer count is 1 then the data pointer points to the
204 * same place as the b_addr field for the buffer, else to kmem_alloced memory.
205 */
206typedef struct xfs_dabuf {
207 int nbuf; /* number of buffer pointers present */
208 short dirty; /* data needs to be copied back */
209 short bbcount; /* how large is data in bbs */
210 void *data; /* pointer for buffers' data */
211#ifdef XFS_DABUF_DEBUG
212 inst_t *ra; /* return address of caller to make */
213 struct xfs_dabuf *next; /* next in global chain */
214 struct xfs_dabuf *prev; /* previous in global chain */
215 struct xfs_buftarg *target; /* device for buffer */
216 xfs_daddr_t blkno; /* daddr first in bps[0] */
217#endif
218 struct xfs_buf *bps[1]; /* actually nbuf of these */
219} xfs_dabuf_t;
220#define XFS_DA_BUF_SIZE(n) \
221 (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
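/* e.g. XFS_DA_BUF_SIZE(3) sizes the struct with room for bps[0..2] */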
222
223#ifdef XFS_DABUF_DEBUG
224extern xfs_dabuf_t *xfs_dabuf_global_list;
225#endif
226
227/*
228 * Storage for holding state during Btree searches and split/join ops.
229 *
230 * Only need space for 5 intermediate nodes. With a minimum of 62-way
231 * fanout to the Btree, we can support over 900 million directory blocks,
232 * which is slightly more than enough.
233 */
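/* (62^5 is roughly 916 million.) */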
234typedef struct xfs_da_state_blk {
235 xfs_dabuf_t *bp; /* buffer containing block */
236 xfs_dablk_t blkno; /* filesystem blkno of buffer */
237 xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
238 int index; /* relevant index into block */
239 xfs_dahash_t hashval; /* last hash value in block */
240 int magic; /* blk's magic number, ie: blk type */
241} xfs_da_state_blk_t;
242
243typedef struct xfs_da_state_path {
244 int active; /* number of active levels */
245 xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH];
246} xfs_da_state_path_t;
247
248typedef struct xfs_da_state {
249 xfs_da_args_t *args; /* filename arguments */
250 struct xfs_mount *mp; /* filesystem mount point */
251 unsigned int blocksize; /* logical block size */
252 unsigned int node_ents; /* how many entries in danode */
253 xfs_da_state_path_t path; /* search/split paths */
254 xfs_da_state_path_t altpath; /* alternate path for join */
255 unsigned char inleaf; /* insert into 1->lf, 0->splf */
256 unsigned char extravalid; /* T/F: extrablk is in use */
257 unsigned char extraafter; /* T/F: extrablk is after new */
258 xfs_da_state_blk_t extrablk; /* for double-splits on leafs */
259 /* for dirv2 extrablk is data */
260} xfs_da_state_t;
261
262/*
263 * Utility macros to aid in logging changed structure fields.
264 */
265#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
266#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
267 (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
268 (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
269
270
271#ifdef __KERNEL__
272/*========================================================================
273 * Function prototypes for the kernel.
274 *========================================================================*/
275
276/*
277 * Routines used for growing the Btree.
278 */
279int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
280 xfs_dabuf_t **bpp, int whichfork);
281int xfs_da_split(xfs_da_state_t *state);
282
283/*
284 * Routines used for shrinking the Btree.
285 */
286int xfs_da_join(xfs_da_state_t *state);
287void xfs_da_fixhashpath(xfs_da_state_t *state,
288 xfs_da_state_path_t *path_to_to_fix);
289
290/*
291 * Routines used for finding things in the Btree.
292 */
293int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
294int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
295 int forward, int release, int *result);
296/*
297 * Utility routines.
298 */
299int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
300 xfs_da_state_blk_t *save_blk);
301int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
302 xfs_da_state_blk_t *new_blk);
303
304/*
305 * Utility routines.
306 */
307int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
308int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
309 xfs_dablk_t bno, xfs_daddr_t mappedbno,
310 xfs_dabuf_t **bp, int whichfork);
311int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
312 xfs_dablk_t bno, xfs_daddr_t mappedbno,
313 xfs_dabuf_t **bpp, int whichfork);
314xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
315 xfs_dablk_t bno, int whichfork);
316int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
317 xfs_dabuf_t *dead_buf);
318
319uint xfs_da_hashname(uchar_t *name_string, int name_length);
320uint xfs_da_log2_roundup(uint i);
321xfs_da_state_t *xfs_da_state_alloc(void);
322void xfs_da_state_free(xfs_da_state_t *state);
323void xfs_da_state_kill_altpath(xfs_da_state_t *state);
324
325void xfs_da_buf_done(xfs_dabuf_t *dabuf);
326void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
327 uint last);
328void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
329void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
330xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
331
332extern struct kmem_zone *xfs_da_state_zone;
333#endif /* __KERNEL__ */
334
335#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
new file mode 100644
index 000000000000..08d551a17347
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.c
@@ -0,0 +1,387 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_ag.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_attr_sf.h"
50#include "xfs_dir_sf.h"
51#include "xfs_dir2_sf.h"
52#include "xfs_dinode.h"
53#include "xfs_inode_item.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_ialloc.h"
57#include "xfs_itable.h"
58#include "xfs_dfrag.h"
59#include "xfs_error.h"
60#include "xfs_mac.h"
61#include "xfs_rw.h"
62
63/*
64 * Syssgi interface for swapext
65 */
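/*
 * The two inodes are locked in ascending inode-number order to avoid
 * deadlock, the caller's stat snapshot is checked against the current
 * ctime/mtime to detect concurrent modification, and the data forks
 * plus the on-disk nblocks/nextents/format fields are then swapped
 * under a transaction.
 */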
66int
67xfs_swapext(
68 xfs_swapext_t __user *sxp)
69{
70 xfs_swapext_t sx;
71 xfs_inode_t *ip=NULL, *tip=NULL, *ips[2];
72 xfs_trans_t *tp;
73 xfs_mount_t *mp;
74 xfs_bstat_t *sbp;
75 struct file *fp = NULL, *tfp = NULL;
76 vnode_t *vp, *tvp;
77 bhv_desc_t *bdp, *tbdp;
78 vn_bhv_head_t *bhp, *tbhp;
79 uint lock_flags=0;
80 int ilf_fields, tilf_fields;
81 int error = 0;
82 xfs_ifork_t tempif, *ifp, *tifp;
83 __uint64_t tmp;
84 int aforkblks = 0;
85 int taforkblks = 0;
86 int locked = 0;
87
88 if (copy_from_user(&sx, sxp, sizeof(sx)))
89 return XFS_ERROR(EFAULT);
90
91 /* Pull information for the target fd */
92 if (((fp = fget((int)sx.sx_fdtarget)) == NULL) ||
93 ((vp = LINVFS_GET_VP(fp->f_dentry->d_inode)) == NULL)) {
94 error = XFS_ERROR(EINVAL);
95 goto error0;
96 }
97
98 bhp = VN_BHV_HEAD(vp);
99 bdp = vn_bhv_lookup(bhp, &xfs_vnodeops);
100 if (bdp == NULL) {
101 error = XFS_ERROR(EBADF);
102 goto error0;
103 } else {
104 ip = XFS_BHVTOI(bdp);
105 }
106
107 if (((tfp = fget((int)sx.sx_fdtmp)) == NULL) ||
108 ((tvp = LINVFS_GET_VP(tfp->f_dentry->d_inode)) == NULL)) {
109 error = XFS_ERROR(EINVAL);
110 goto error0;
111 }
112
113 tbhp = VN_BHV_HEAD(tvp);
114 tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops);
115 if (tbdp == NULL) {
116 error = XFS_ERROR(EBADF);
117 goto error0;
118 } else {
119 tip = XFS_BHVTOI(tbdp);
120 }
121
122 if (ip->i_mount != tip->i_mount) {
123 error = XFS_ERROR(EINVAL);
124 goto error0;
125 }
126
127 if (ip->i_ino == tip->i_ino) {
128 error = XFS_ERROR(EINVAL);
129 goto error0;
130 }
131
132 mp = ip->i_mount;
133
134 sbp = &sx.sx_stat;
135
136 if (XFS_FORCED_SHUTDOWN(mp)) {
137 error = XFS_ERROR(EIO);
138 goto error0;
139 }
140
141 locked = 1;
142
143 /* Lock in i_ino order */
144 if (ip->i_ino < tip->i_ino) {
145 ips[0] = ip;
146 ips[1] = tip;
147 } else {
148 ips[0] = tip;
149 ips[1] = ip;
150 }
151 lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
152 xfs_lock_inodes(ips, 2, 0, lock_flags);
153
154 /* Check permissions */
155 error = xfs_iaccess(ip, S_IWUSR, NULL);
156 if (error)
157 goto error0;
158
159 error = xfs_iaccess(tip, S_IWUSR, NULL);
160 if (error)
161 goto error0;
162
163 /* Verify that both files have the same format */
164 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
165 error = XFS_ERROR(EINVAL);
166 goto error0;
167 }
168
169 /* Verify both files are either real-time or non-realtime */
170 if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
171 (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
172 error = XFS_ERROR(EINVAL);
173 goto error0;
174 }
175
176 /* Should never get a local format */
177 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
178 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
179 error = XFS_ERROR(EINVAL);
180 goto error0;
181 }
182
183 if (VN_CACHED(tvp) != 0)
184 xfs_inval_cached_pages(XFS_ITOV(tip), &(tip->i_iocore),
185 (loff_t)0, 0, 0);
186
187 /* Verify O_DIRECT for ftmp */
188 if (VN_CACHED(tvp) != 0) {
189 error = XFS_ERROR(EINVAL);
190 goto error0;
191 }
192
193 /* Verify all data are being swapped */
194 if (sx.sx_offset != 0 ||
195 sx.sx_length != ip->i_d.di_size ||
196 sx.sx_length != tip->i_d.di_size) {
197 error = XFS_ERROR(EFAULT);
198 goto error0;
199 }
200
201	/*
202	 * If the target has extended attributes, the tmp file
203	 * must have them too, in order to ensure the correct
204	 * data fork format.
205	 */
206 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
207 error = XFS_ERROR(EINVAL);
208 goto error0;
209 }
210
211	/*
212	 * Compare the current change & modify times with those
213	 * passed in. If they differ, we abort this swap.
214	 * This is the mechanism that assures the calling
215	 * process that the file was not changed out from
216	 * under it.
217	 */
218 if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
219 (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
220 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
221 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
222 error = XFS_ERROR(EBUSY);
223 goto error0;
224 }
225
226	/* We need to fail if the file is memory mapped. Once we have tossed
227	 * all existing pages, a page fault has no option but to go to the
228	 * filesystem for pages. Because the page fault calls VOP_READ (or
229	 * write in the case of autogrow), it blocks on the iolock until we
230	 * have switched the extents.
231	 */
232 if (VN_MAPPED(vp)) {
233 error = XFS_ERROR(EBUSY);
234 goto error0;
235 }
236
237 xfs_iunlock(ip, XFS_ILOCK_EXCL);
238 xfs_iunlock(tip, XFS_ILOCK_EXCL);
239
240 /*
241 * There is a race condition here since we gave up the
242 * ilock. However, the data fork will not change since
243 * we have the iolock (locked for truncation too) so we
244 * are safe. We don't really care if non-io related
245 * fields change.
246 */
247
248 VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
249
250 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
251 if ((error = xfs_trans_reserve(tp, 0,
252 XFS_ICHANGE_LOG_RES(mp), 0,
253 0, 0))) {
254 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
255 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
256 xfs_trans_cancel(tp, 0);
257 return error;
258 }
259 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
260
261 /*
262 * Count the number of extended attribute blocks
263 */
264 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
265 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
266 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
267 if (error) {
268 xfs_iunlock(ip, lock_flags);
269 xfs_iunlock(tip, lock_flags);
270 xfs_trans_cancel(tp, 0);
271 return error;
272 }
273 }
274 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
275 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
276 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
277 &taforkblks);
278 if (error) {
279 xfs_iunlock(ip, lock_flags);
280 xfs_iunlock(tip, lock_flags);
281 xfs_trans_cancel(tp, 0);
282 return error;
283 }
284 }
285
286 /*
287 * Swap the data forks of the inodes
288 */
289 ifp = &ip->i_df;
290 tifp = &tip->i_df;
291 tempif = *ifp; /* struct copy */
292 *ifp = *tifp; /* struct copy */
293 *tifp = tempif; /* struct copy */
294
295 /*
296 * Fix the on-disk inode values
297 */
298 tmp = (__uint64_t)ip->i_d.di_nblocks;
299 ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
300 tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
301
302 tmp = (__uint64_t) ip->i_d.di_nextents;
303 ip->i_d.di_nextents = tip->i_d.di_nextents;
304 tip->i_d.di_nextents = tmp;
305
306 tmp = (__uint64_t) ip->i_d.di_format;
307 ip->i_d.di_format = tip->i_d.di_format;
308 tip->i_d.di_format = tmp;
309
310 ilf_fields = XFS_ILOG_CORE;
311
312 switch(ip->i_d.di_format) {
313 case XFS_DINODE_FMT_EXTENTS:
314 /* If the extents fit in the inode, fix the
315 * pointer. Otherwise it's already NULL or
316 * pointing to the extent.
317 */
318 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
319 ifp->if_u1.if_extents =
320 ifp->if_u2.if_inline_ext;
321 }
322 ilf_fields |= XFS_ILOG_DEXT;
323 break;
324 case XFS_DINODE_FMT_BTREE:
325 ilf_fields |= XFS_ILOG_DBROOT;
326 break;
327 }
328
329 tilf_fields = XFS_ILOG_CORE;
330
331 switch(tip->i_d.di_format) {
332 case XFS_DINODE_FMT_EXTENTS:
333 /* If the extents fit in the inode, fix the
334 * pointer. Otherwise it's already NULL or
335 * pointing to the extent.
336 */
337 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
338 tifp->if_u1.if_extents =
339 tifp->if_u2.if_inline_ext;
340 }
341 tilf_fields |= XFS_ILOG_DEXT;
342 break;
343 case XFS_DINODE_FMT_BTREE:
344 tilf_fields |= XFS_ILOG_DBROOT;
345 break;
346 }
347
348 /*
349 * Increment vnode ref counts since xfs_trans_commit &
350 * xfs_trans_cancel will both unlock the inodes and
351 * decrement the associated ref counts.
352 */
353 VN_HOLD(vp);
354 VN_HOLD(tvp);
355
356 xfs_trans_ijoin(tp, ip, lock_flags);
357 xfs_trans_ijoin(tp, tip, lock_flags);
358
359 xfs_trans_log_inode(tp, ip, ilf_fields);
360 xfs_trans_log_inode(tp, tip, tilf_fields);
361
362 /*
363 * If this is a synchronous mount, make sure that the
364 * transaction goes to disk before returning to the user.
365 */
366 if (mp->m_flags & XFS_MOUNT_WSYNC) {
367 xfs_trans_set_sync(tp);
368 }
369
370 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL);
371
372 fput(fp);
373 fput(tfp);
374
375 return error;
376
377 error0:
378 if (locked) {
379 xfs_iunlock(ip, lock_flags);
380 xfs_iunlock(tip, lock_flags);
381 }
382
383 if (fp != NULL) fput(fp);
384 if (tfp != NULL) fput(tfp);
385
386 return error;
387}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
new file mode 100644
index 000000000000..904860594b8f
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DFRAG_H__
33#define __XFS_DFRAG_H__
34
35/*
36 * Structure passed to xfs_swapext
37 */
38
39typedef struct xfs_swapext
40{
41 __int64_t sx_version; /* version */
42 __int64_t sx_fdtarget; /* fd of target file */
43 __int64_t sx_fdtmp; /* fd of tmp file */
44 xfs_off_t sx_offset; /* offset into file */
45	xfs_off_t	sx_length;	/* length from offset */
46 char sx_pad[16]; /* pad space, unused */
47 xfs_bstat_t sx_stat; /* stat of target b4 copy */
48} xfs_swapext_t;
49
50/*
51 * Version flag
52 */
53#define XFS_SX_VERSION 0
54
55#ifdef __KERNEL__
56/*
57 * Prototypes for visible xfs_dfrag.c routines.
58 */
59
60/*
61 * Syscall interface for xfs_swapext
62 */
63int xfs_swapext(struct xfs_swapext __user *sx);
64
65#endif /* __KERNEL__ */
66
67#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
new file mode 100644
index 000000000000..f5c932b064e6
--- /dev/null
+++ b/fs/xfs/xfs_dinode.h
@@ -0,0 +1,418 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DINODE_H__
33#define __XFS_DINODE_H__
34
35struct xfs_buf;
36struct xfs_mount;
37
38#define XFS_DINODE_VERSION_1 1
39#define XFS_DINODE_VERSION_2 2
40#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DINODE_GOOD_VERSION)
41int xfs_dinode_good_version(int v);
42#define XFS_DINODE_GOOD_VERSION(v) xfs_dinode_good_version(v)
43#else
44#define XFS_DINODE_GOOD_VERSION(v) (((v) == XFS_DINODE_VERSION_1) || \
45 ((v) == XFS_DINODE_VERSION_2))
46#endif
47#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
48
49/*
50 * Disk inode structure.
51 * This is just the header; the inode is expanded to fill a variable size
52 * with the last field expanding. It is split into the core and "other"
53 * because we only need the core part in the in-core inode.
54 */
55typedef struct xfs_timestamp {
56 __int32_t t_sec; /* timestamp seconds */
57 __int32_t t_nsec; /* timestamp nanoseconds */
58} xfs_timestamp_t;
59
60/*
61 * Note: Coordinate changes to this structure with the XFS_DI_* #defines
62 * below and the offsets table in xfs_ialloc_log_di().
63 */
64typedef struct xfs_dinode_core
65{
66 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
67 __uint16_t di_mode; /* mode and type of file */
68 __int8_t di_version; /* inode version */
69 __int8_t di_format; /* format of di_c data */
70 __uint16_t di_onlink; /* old number of links to file */
71 __uint32_t di_uid; /* owner's user id */
72 __uint32_t di_gid; /* owner's group id */
73 __uint32_t di_nlink; /* number of links to file */
74 __uint16_t di_projid; /* owner's project id */
75 __uint8_t di_pad[8]; /* unused, zeroed space */
76 __uint16_t di_flushiter; /* incremented on flush */
77 xfs_timestamp_t di_atime; /* time last accessed */
78 xfs_timestamp_t di_mtime; /* time last modified */
79 xfs_timestamp_t di_ctime; /* time created/inode modified */
80 xfs_fsize_t di_size; /* number of bytes in file */
81 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
82 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
83 xfs_extnum_t di_nextents; /* number of extents in data fork */
84 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
85 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
86 __int8_t di_aformat; /* format of attr fork's data */
87 __uint32_t di_dmevmask; /* DMIG event mask */
88 __uint16_t di_dmstate; /* DMIG state info */
89 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
90 __uint32_t di_gen; /* generation number */
91} xfs_dinode_core_t;
92
93#define DI_MAX_FLUSH 0xffff
94
95typedef struct xfs_dinode
96{
97 xfs_dinode_core_t di_core;
98 /*
99 * In adding anything between the core and the union, be
100 * sure to update the macros like XFS_LITINO below and
101 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
102 */
103 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
104 union {
105 xfs_bmdr_block_t di_bmbt; /* btree root block */
106 xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
107 xfs_dir_shortform_t di_dirsf; /* shortform directory */
108 xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
109 char di_c[1]; /* local contents */
110 xfs_dev_t di_dev; /* device for S_IFCHR/S_IFBLK */
111 uuid_t di_muuid; /* mount point value */
112 char di_symlink[1]; /* local symbolic link */
113 } di_u;
114 union {
115 xfs_bmdr_block_t di_abmbt; /* btree root block */
116 xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
117 xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
118 } di_a;
119} xfs_dinode_t;
120
121/*
122 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
123 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
124 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
125 */
126#define XFS_MAXLINK ((1U << 31) - 1U)
127#define XFS_MAXLINK_1 65535U
128
129/*
130 * Bit names for logging disk inodes only
131 */
132#define XFS_DI_MAGIC 0x0000001
133#define XFS_DI_MODE 0x0000002
134#define XFS_DI_VERSION 0x0000004
135#define XFS_DI_FORMAT 0x0000008
136#define XFS_DI_ONLINK 0x0000010
137#define XFS_DI_UID 0x0000020
138#define XFS_DI_GID 0x0000040
139#define XFS_DI_NLINK 0x0000080
140#define XFS_DI_PROJID 0x0000100
141#define XFS_DI_PAD 0x0000200
142#define XFS_DI_ATIME 0x0000400
143#define XFS_DI_MTIME 0x0000800
144#define XFS_DI_CTIME 0x0001000
145#define XFS_DI_SIZE 0x0002000
146#define XFS_DI_NBLOCKS 0x0004000
147#define XFS_DI_EXTSIZE 0x0008000
148#define XFS_DI_NEXTENTS 0x0010000
149#define XFS_DI_NAEXTENTS 0x0020000
150#define XFS_DI_FORKOFF 0x0040000
151#define XFS_DI_AFORMAT 0x0080000
152#define XFS_DI_DMEVMASK 0x0100000
153#define XFS_DI_DMSTATE 0x0200000
154#define XFS_DI_FLAGS 0x0400000
155#define XFS_DI_GEN 0x0800000
156#define XFS_DI_NEXT_UNLINKED 0x1000000
157#define XFS_DI_U 0x2000000
158#define XFS_DI_A 0x4000000
159#define XFS_DI_NUM_BITS 27
160#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
161#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
162
163/*
164 * Values for di_format
165 */
166typedef enum xfs_dinode_fmt
167{
168 XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */
169 XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */
170 /* LNK: di_symlink */
171 XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */
172 XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
173 XFS_DINODE_FMT_UUID /* MNT: di_uuid */
174} xfs_dinode_fmt_t;
175
176/*
177 * Inode minimum and maximum sizes.
178 */
179#define XFS_DINODE_MIN_LOG 8
180#define XFS_DINODE_MAX_LOG 11
181#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
182#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
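/* i.e. on-disk inodes range from 256 (1 << 8) to 2048 (1 << 11) bytes */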
183
184/*
185 * Inode size for given fs.
186 */
187#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LITINO)
188int xfs_litino(struct xfs_mount *mp);
189#define XFS_LITINO(mp) xfs_litino(mp)
190#else
191#define XFS_LITINO(mp) ((mp)->m_litino)
192#endif
193#define XFS_BROOT_SIZE_ADJ \
194 (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
195
196/*
197 * Fork identifiers. Here so utilities can use them without including
198 * xfs_inode.h.
199 */
200#define XFS_DATA_FORK 0
201#define XFS_ATTR_FORK 1
202
203/*
204 * Inode data & attribute fork sizes, per inode.
205 */
206#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_Q)
207int xfs_cfork_q_disk(xfs_dinode_core_t *dcp);
208int xfs_cfork_q(xfs_dinode_core_t *dcp);
209#define XFS_CFORK_Q_DISK(dcp) xfs_cfork_q_disk(dcp)
210#define XFS_CFORK_Q(dcp) xfs_cfork_q(dcp)
211#else
212#define XFS_CFORK_Q_DISK(dcp) ((dcp)->di_forkoff != 0)
213#define XFS_CFORK_Q(dcp) ((dcp)->di_forkoff != 0)
214
215#endif
216#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_BOFF)
217int xfs_cfork_boff_disk(xfs_dinode_core_t *dcp);
218int xfs_cfork_boff(xfs_dinode_core_t *dcp);
219#define XFS_CFORK_BOFF_DISK(dcp) xfs_cfork_boff_disk(dcp)
220#define XFS_CFORK_BOFF(dcp) xfs_cfork_boff(dcp)
221#else
222#define XFS_CFORK_BOFF_DISK(dcp) ((int)(INT_GET((dcp)->di_forkoff, ARCH_CONVERT) << 3))
223#define XFS_CFORK_BOFF(dcp) ((int)((dcp)->di_forkoff << 3))
224
225#endif
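/* di_forkoff is stored in 8-byte units, so BOFF converts it to bytes */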
226#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_DSIZE)
227int xfs_cfork_dsize_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
228int xfs_cfork_dsize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
229#define XFS_CFORK_DSIZE_DISK(dcp,mp) xfs_cfork_dsize_disk(dcp,mp)
230#define XFS_CFORK_DSIZE(dcp,mp) xfs_cfork_dsize(dcp,mp)
231#else
232#define XFS_CFORK_DSIZE_DISK(dcp,mp) \
233 (XFS_CFORK_Q_DISK(dcp) ? XFS_CFORK_BOFF_DISK(dcp) : XFS_LITINO(mp))
234#define XFS_CFORK_DSIZE(dcp,mp) \
235 (XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp))
236
237#endif
238#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_ASIZE)
239int xfs_cfork_asize_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
240int xfs_cfork_asize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
241#define XFS_CFORK_ASIZE_DISK(dcp,mp) xfs_cfork_asize_disk(dcp,mp)
242#define XFS_CFORK_ASIZE(dcp,mp) xfs_cfork_asize(dcp,mp)
243#else
244#define XFS_CFORK_ASIZE_DISK(dcp,mp) \
245 (XFS_CFORK_Q_DISK(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_DISK(dcp) : 0)
246#define XFS_CFORK_ASIZE(dcp,mp) \
247 (XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0)
248
249#endif
250#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_SIZE)
251int xfs_cfork_size_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w);
252int xfs_cfork_size(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w);
253#define XFS_CFORK_SIZE_DISK(dcp,mp,w) xfs_cfork_size_disk(dcp,mp,w)
254#define XFS_CFORK_SIZE(dcp,mp,w) xfs_cfork_size(dcp,mp,w)
255#else
256#define XFS_CFORK_SIZE_DISK(dcp,mp,w) \
257 ((w) == XFS_DATA_FORK ? \
258 XFS_CFORK_DSIZE_DISK(dcp, mp) : \
259 XFS_CFORK_ASIZE_DISK(dcp, mp))
260#define XFS_CFORK_SIZE(dcp,mp,w) \
261 ((w) == XFS_DATA_FORK ? \
262 XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp))
263
264#endif
265
266#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DSIZE)
267int xfs_dfork_dsize(xfs_dinode_t *dip, struct xfs_mount *mp);
268#define XFS_DFORK_DSIZE(dip,mp) xfs_dfork_dsize(dip,mp)
269#else
270#define XFS_DFORK_DSIZE(dip,mp) XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp)
271
272#endif
273#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_ASIZE)
274int xfs_dfork_asize(xfs_dinode_t *dip, struct xfs_mount *mp);
275#define XFS_DFORK_ASIZE(dip,mp) xfs_dfork_asize(dip,mp)
276#else
277#define XFS_DFORK_ASIZE(dip,mp) XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp)
278
279#endif
280#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_SIZE)
281int xfs_dfork_size(xfs_dinode_t *dip, struct xfs_mount *mp, int w);
282#define XFS_DFORK_SIZE(dip,mp,w) xfs_dfork_size(dip,mp,w)
283#else
284#define XFS_DFORK_SIZE(dip,mp,w) XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w)
285
286#endif
287
288/*
289 * Macros for accessing per-fork disk inode information.
290 */
291#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_Q)
292int xfs_dfork_q(xfs_dinode_t *dip);
293#define XFS_DFORK_Q(dip) xfs_dfork_q(dip)
294#else
295#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core)
296
297#endif
298#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_BOFF)
299int xfs_dfork_boff(xfs_dinode_t *dip);
300#define XFS_DFORK_BOFF(dip) xfs_dfork_boff(dip)
301#else
302#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core)
303
304#endif
305#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DPTR)
306char *xfs_dfork_dptr(xfs_dinode_t *dip);
307#define XFS_DFORK_DPTR(dip) xfs_dfork_dptr(dip)
308#else
309#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c)
310
311#endif
312#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_APTR)
313char *xfs_dfork_aptr(xfs_dinode_t *dip);
314#define XFS_DFORK_APTR(dip) xfs_dfork_aptr(dip)
315#else
316#define XFS_DFORK_APTR(dip) ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip))
317
318#endif
319#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_PTR)
320char *xfs_dfork_ptr(xfs_dinode_t *dip, int w);
321#define XFS_DFORK_PTR(dip,w) xfs_dfork_ptr(dip,w)
322#else
323#define XFS_DFORK_PTR(dip,w) \
324 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
325
326#endif
327#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FORMAT)
328int xfs_cfork_format(xfs_dinode_core_t *dcp, int w);
329#define XFS_CFORK_FORMAT(dcp,w) xfs_cfork_format(dcp,w)
330#else
331#define XFS_CFORK_FORMAT(dcp,w) \
332 ((w) == XFS_DATA_FORK ? (dcp)->di_format : (dcp)->di_aformat)
333
334#endif
335#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FMT_SET)
336void xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n);
337#define XFS_CFORK_FMT_SET(dcp,w,n) xfs_cfork_fmt_set(dcp,w,n)
338#else
339#define XFS_CFORK_FMT_SET(dcp,w,n) \
340 ((w) == XFS_DATA_FORK ? \
341 ((dcp)->di_format = (n)) : \
342 ((dcp)->di_aformat = (n)))
343
344#endif
345#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXTENTS)
346int xfs_cfork_nextents_disk(xfs_dinode_core_t *dcp, int w);
347int xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w);
348#define XFS_CFORK_NEXTENTS_DISK(dcp,w) xfs_cfork_nextents_disk(dcp,w)
349#define XFS_CFORK_NEXTENTS(dcp,w) xfs_cfork_nextents(dcp,w)
350#else
351#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \
352 ((w) == XFS_DATA_FORK ? \
353 INT_GET((dcp)->di_nextents, ARCH_CONVERT) : \
354 INT_GET((dcp)->di_anextents, ARCH_CONVERT))
355#define XFS_CFORK_NEXTENTS(dcp,w) \
356 ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents)
357
358#endif
359#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXT_SET)
360void xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n);
361#define XFS_CFORK_NEXT_SET(dcp,w,n) xfs_cfork_next_set(dcp,w,n)
362#else
363#define XFS_CFORK_NEXT_SET(dcp,w,n) \
364 ((w) == XFS_DATA_FORK ? \
365 ((dcp)->di_nextents = (n)) : \
366 ((dcp)->di_anextents = (n)))
367
368#endif
369
370#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXTENTS)
371int xfs_dfork_nextents(xfs_dinode_t *dip, int w);
372#define XFS_DFORK_NEXTENTS(dip,w) xfs_dfork_nextents(dip,w)
373#else
374#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w)
375#endif
376
377#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_DINODE)
378xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp);
379#define XFS_BUF_TO_DINODE(bp) xfs_buf_to_dinode(bp)
380#else
381#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)(XFS_BUF_PTR(bp)))
382#endif
383
384/*
385 * Values for di_flags
386 * There should be a one-to-one correspondence between these flags and the
387 * XFS_XFLAG_s.
388 */
389#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
390#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
391#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
392#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
393#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
394#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
395#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
396#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
397#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
398#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
399#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
400#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
401#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
402#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
403#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
404#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
405#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
406#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
407#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
408#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
409#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
410#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
411
412#define XFS_DIFLAG_ANY \
413 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
414 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
415 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
416 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS)
417
418#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
new file mode 100644
index 000000000000..ba30bc7682f2
--- /dev/null
+++ b/fs/xfs/xfs_dir.c
@@ -0,0 +1,1223 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_alloc.h"
49#include "xfs_btree.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_da_btree.h"
57#include "xfs_dir_leaf.h"
58#include "xfs_error.h"
59
60/*
61 * xfs_dir.c
62 *
63 * Provide the external interfaces to manage directories.
64 */
65
66/*========================================================================
67 * Function prototypes for the kernel.
68 *========================================================================*/
69
70/*
71 * Functions for the dirops interfaces.
72 */
73static void xfs_dir_mount(struct xfs_mount *mp);
74
75static int xfs_dir_isempty(struct xfs_inode *dp);
76
77static int xfs_dir_init(struct xfs_trans *trans,
78 struct xfs_inode *dir,
79 struct xfs_inode *parent_dir);
80
81static int xfs_dir_createname(struct xfs_trans *trans,
82 struct xfs_inode *dp,
83 char *name_string,
84 int name_len,
85 xfs_ino_t inode_number,
86 xfs_fsblock_t *firstblock,
87 xfs_bmap_free_t *flist,
88 xfs_extlen_t total);
89
90static int xfs_dir_lookup(struct xfs_trans *tp,
91 struct xfs_inode *dp,
92 char *name_string,
93 int name_length,
94 xfs_ino_t *inode_number);
95
96static int xfs_dir_removename(struct xfs_trans *trans,
97 struct xfs_inode *dp,
98 char *name_string,
99 int name_length,
100 xfs_ino_t ino,
101 xfs_fsblock_t *firstblock,
102 xfs_bmap_free_t *flist,
103 xfs_extlen_t total);
104
105static int xfs_dir_getdents(struct xfs_trans *tp,
106 struct xfs_inode *dp,
107 struct uio *uiop,
108 int *eofp);
109
110static int xfs_dir_replace(struct xfs_trans *tp,
111 struct xfs_inode *dp,
112 char *name_string,
113 int name_length,
114 xfs_ino_t inode_number,
115 xfs_fsblock_t *firstblock,
116 xfs_bmap_free_t *flist,
117 xfs_extlen_t total);
118
119static int xfs_dir_canenter(struct xfs_trans *tp,
120 struct xfs_inode *dp,
121 char *name_string,
122 int name_length);
123
124static int xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
125 xfs_dinode_t *dip);
126
127xfs_dirops_t xfsv1_dirops = {
128 .xd_mount = xfs_dir_mount,
129 .xd_isempty = xfs_dir_isempty,
130 .xd_init = xfs_dir_init,
131 .xd_createname = xfs_dir_createname,
132 .xd_lookup = xfs_dir_lookup,
133 .xd_removename = xfs_dir_removename,
134 .xd_getdents = xfs_dir_getdents,
135 .xd_replace = xfs_dir_replace,
136 .xd_canenter = xfs_dir_canenter,
137 .xd_shortform_validate_ondisk = xfs_dir_shortform_validate_ondisk,
138 .xd_shortform_to_single = xfs_dir_shortform_to_leaf,
139};
140
141/*
142 * Internal routines when dirsize == XFS_LBSIZE(mp).
143 */
144STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
145STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
146 int *total_namebytes);
147STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
148 uio_t *uio, int *eofp,
149 xfs_dirent_t *dbp,
150 xfs_dir_put_t put);
151STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
152
153/*
154 * Internal routines when dirsize > XFS_LBSIZE(mp).
155 */
156STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
157STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
158STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
159STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
160 uio_t *uio, int *eofp,
161 xfs_dirent_t *dbp,
162 xfs_dir_put_t put);
163STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
164
165#if defined(XFS_DIR_TRACE)
166ktrace_t *xfs_dir_trace_buf;
167#endif
168
169
170/*========================================================================
171 * Overall external interface routines.
172 *========================================================================*/
173
174xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
175
176/*
177 * One-time startup routine called from xfs_init().
178 */
179void
180xfs_dir_startup(void)
181{
182 xfs_dir_hash_dot = xfs_da_hashname(".", 1);
183 xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
184}
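/*
 * Added commentary: the hash used above is xfs_da_hashname(), defined
 * in xfs_da_btree.c.  For reference it is essentially the rolling
 * function sketched below; treat this as an illustrative copy under
 * that assumption, not the authoritative definition.
 */
#if 0	/* sketch only, the real routine lives in xfs_da_btree.c */
xfs_dahash_t
xfs_da_hashname(uchar_t *name, int namelen)
{
	xfs_dahash_t hash;

	/*
	 * Do four characters at a time as long as we can.
	 */
	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
		       (name[3] << 0) ^ rol32(hash, 7 * 4);
	/*
	 * Now do the rest of the characters.
	 */
	switch (namelen) {
	case 3:
		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
		       rol32(hash, 7 * 3);
	case 2:
		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
	case 1:
		return (name[0] << 0) ^ rol32(hash, 7 * 1);
	default: /* case 0 */
		return hash;
	}
}
#endif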
185
186/*
187 * Initialize directory-related fields in the mount structure.
188 */
189static void
190xfs_dir_mount(xfs_mount_t *mp)
191{
192 uint shortcount, leafcount, count;
193
194 mp->m_dirversion = 1;
195 shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) /
196 (uint)sizeof(xfs_dir_sf_entry_t);
197 leafcount = (XFS_LBSIZE(mp) - (uint)sizeof(xfs_dir_leaf_hdr_t)) /
198 ((uint)sizeof(xfs_dir_leaf_entry_t) +
199 (uint)sizeof(xfs_dir_leaf_name_t));
200 count = shortcount > leafcount ? shortcount : leafcount;
201 mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
202 ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
203 mp->m_dir_node_ents = mp->m_attr_node_ents =
204 (XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
205 (uint)sizeof(xfs_da_node_entry_t);
206 mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
207 mp->m_dirblksize = mp->m_sb.sb_blocksize;
208 mp->m_dirblkfsbs = 1;
209}
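/*
 * Added commentary: xfs_da_log2_roundup() (in xfs_da_btree.c) returns
 * the smallest n with (1 << n) >= i, so m_dircook_elog above is just
 * enough bits to address any entry index within a directory block.
 * A minimal sketch of that helper, assuming NBBY bits per byte:
 */
#if 0	/* sketch only */
uint
xfs_da_log2_roundup(uint i)
{
	uint rval;

	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
		if ((1 << rval) >= i)
			break;
	}
	return rval;
}
#endif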
210
211/*
212 * Return 1 if directory contains only "." and "..".
213 */
214static int
215xfs_dir_isempty(xfs_inode_t *dp)
216{
217 xfs_dir_sf_hdr_t *hdr;
218
219 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
220 if (dp->i_d.di_size == 0)
221 return(1);
222 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
223 return(0);
224 hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
225 return(hdr->count == 0);
226}
227
228/*
229 * Initialize a directory with its "." and ".." entries.
230 */
231static int
232xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
233{
234 xfs_da_args_t args;
235 int error;
236
237 memset((char *)&args, 0, sizeof(args));
238 args.dp = dir;
239 args.trans = trans;
240
241 ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR);
242 if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
243 return error;
244
245 return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
246}
247
248/*
249 * Generic handler routine to add a name to a directory.
250 * Transitions directory from shortform to Btree as necessary.
251 */
252static int /* error */
253xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
254 int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
255 xfs_bmap_free_t *flist, xfs_extlen_t total)
256{
257 xfs_da_args_t args;
258 int retval, newsize, done;
259
260 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
261
262 if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
263 return (retval);
264
265 XFS_STATS_INC(xs_dir_create);
266 /*
267 * Fill in the arg structure for this request.
268 */
269 args.name = name;
270 args.namelen = namelen;
271 args.hashval = xfs_da_hashname(name, namelen);
272 args.inumber = inum;
273 args.dp = dp;
274 args.firstblock = firstblock;
275 args.flist = flist;
276 args.total = total;
277 args.whichfork = XFS_DATA_FORK;
278 args.trans = trans;
279 args.justcheck = 0;
280 args.addname = args.oknoent = 1;
281
282 /*
283 * Decide on what work routines to call based on the inode size.
284 */
285 done = 0;
286 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
287 newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
288 if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
289 retval = xfs_dir_shortform_addname(&args);
290 done = 1;
291 } else {
292 if (total == 0)
293 return XFS_ERROR(ENOSPC);
294 retval = xfs_dir_shortform_to_leaf(&args);
295 done = retval != 0;
296 }
297 }
298 if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
299 retval = xfs_dir_leaf_addname(&args);
300 done = retval != ENOSPC;
301 if (!done) {
302 if (total == 0)
303 return XFS_ERROR(ENOSPC);
304 retval = xfs_dir_leaf_to_node(&args);
305 done = retval != 0;
306 }
307 }
308 if (!done) {
309 retval = xfs_dir_node_addname(&args);
310 }
311 return(retval);
312}
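/*
 * Illustrative caller (added commentary; a hedged sketch, not lifted
 * from this file): a create operation typically initializes a bmap
 * free list and calls through the dirops vector inside its
 * transaction.  "tp", "dp", "ip" and "resblks" are hypothetical
 * locals and error handling is elided:
 *
 *	XFS_BMAP_INIT(&free_list, &first_block);
 *	error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
 *				   &first_block, &free_list, resblks);
 */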
313
314/*
315 * Generic handler routine to check if a name can be added to a directory,
316 * without adding any blocks to the directory.
317 */
318static int /* error */
319xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
320{
321 xfs_da_args_t args;
322 int retval, newsize;
323
324 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
325 /*
326 * Fill in the arg structure for this request.
327 */
328 args.name = name;
329 args.namelen = namelen;
330 args.hashval = xfs_da_hashname(name, namelen);
331 args.inumber = 0;
332 args.dp = dp;
333 args.firstblock = NULL;
334 args.flist = NULL;
335 args.total = 0;
336 args.whichfork = XFS_DATA_FORK;
337 args.trans = trans;
338 args.justcheck = args.addname = args.oknoent = 1;
339
340 /*
341 * Decide on what work routines to call based on the inode size.
342 */
343 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
344 newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
345 if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
346 retval = 0;
347 else
348 retval = XFS_ERROR(ENOSPC);
349 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
350 retval = xfs_dir_leaf_addname(&args);
351 } else {
352 retval = xfs_dir_node_addname(&args);
353 }
354 return(retval);
355}
356
357/*
358 * Generic handler routine to remove a name from a directory.
359 * Transitions directory from Btree to shortform as necessary.
360 */
361static int /* error */
362xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
363 int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
364 xfs_bmap_free_t *flist, xfs_extlen_t total)
365{
366 xfs_da_args_t args;
367 int count, totallen, newsize, retval;
368
369 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
370 XFS_STATS_INC(xs_dir_remove);
371 /*
372 * Fill in the arg structure for this request.
373 */
374 args.name = name;
375 args.namelen = namelen;
376 args.hashval = xfs_da_hashname(name, namelen);
377 args.inumber = ino;
378 args.dp = dp;
379 args.firstblock = firstblock;
380 args.flist = flist;
381 args.total = total;
382 args.whichfork = XFS_DATA_FORK;
383 args.trans = trans;
384 args.justcheck = args.addname = args.oknoent = 0;
385
386 /*
387 * Decide on what work routines to call based on the inode size.
388 */
389 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
390 retval = xfs_dir_shortform_removename(&args);
391 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
392 retval = xfs_dir_leaf_removename(&args, &count, &totallen);
393 if (retval == 0) {
394 newsize = XFS_DIR_SF_ALLFIT(count, totallen);
395 if (newsize <= XFS_IFORK_DSIZE(dp)) {
396 retval = xfs_dir_leaf_to_shortform(&args);
397 }
398 }
399 } else {
400 retval = xfs_dir_node_removename(&args);
401 }
402 return(retval);
403}
404
405static int /* error */
406xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
407 xfs_ino_t *inum)
408{
409 xfs_da_args_t args;
410 int retval;
411
412 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
413
414 XFS_STATS_INC(xs_dir_lookup);
415 /*
416 * Fill in the arg structure for this request.
417 */
418 args.name = name;
419 args.namelen = namelen;
420 args.hashval = xfs_da_hashname(name, namelen);
421 args.inumber = 0;
422 args.dp = dp;
423 args.firstblock = NULL;
424 args.flist = NULL;
425 args.total = 0;
426 args.whichfork = XFS_DATA_FORK;
427 args.trans = trans;
428 args.justcheck = args.addname = 0;
429 args.oknoent = 1;
430
431 /*
432 * Decide on what work routines to call based on the inode size.
433 */
434 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
435 retval = xfs_dir_shortform_lookup(&args);
436 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
437 retval = xfs_dir_leaf_lookup(&args);
438 } else {
439 retval = xfs_dir_node_lookup(&args);
440 }
441 if (retval == EEXIST)
442 retval = 0;
443 *inum = args.inumber;
444 return(retval);
445}
446
447/*
448 * Implement readdir.
449 */
450static int /* error */
451xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
452{
453 xfs_dirent_t *dbp;
454 int alignment, retval;
455 xfs_dir_put_t put;
456
457 XFS_STATS_INC(xs_dir_getdents);
458 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
459
460 /*
461 * If our caller has given us a single contiguous aligned memory buffer,
462 * just work directly within that buffer. If it's in user memory,
463 * lock it down first.
464 */
465 alignment = sizeof(xfs_off_t) - 1;
466 if ((uio->uio_iovcnt == 1) &&
467 (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
468 ((uio->uio_iov[0].iov_len & alignment) == 0)) {
469 dbp = NULL;
470 put = xfs_dir_put_dirent64_direct;
471 } else {
472 dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
473 put = xfs_dir_put_dirent64_uio;
474 }
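	/*
	 * Illustration of the alignment test above (added commentary):
	 * with sizeof(xfs_off_t) == 8, "alignment" is the mask 0x7, so
	 * an iov_base of 0x1000 qualifies ((0x1000 & 0x7) == 0) while
	 * one of 0x1003 falls back to the bounce buffer and the uio
	 * copy routine.
	 */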
475
476 /*
477 * Decide on what work routines to call based on the inode size.
478 */
479 *eofp = 0;
480
481 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
482 retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
483 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
484 retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
485 } else {
486 retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
487 }
488 if (dbp != NULL)
489 kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
490
491 return(retval);
492}
493
494static int /* error */
495xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
496 xfs_ino_t inum, xfs_fsblock_t *firstblock,
497 xfs_bmap_free_t *flist, xfs_extlen_t total)
498{
499 xfs_da_args_t args;
500 int retval;
501
502 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
503
504 if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
505 return retval;
506
507 /*
508 * Fill in the arg structure for this request.
509 */
510 args.name = name;
511 args.namelen = namelen;
512 args.hashval = xfs_da_hashname(name, namelen);
513 args.inumber = inum;
514 args.dp = dp;
515 args.firstblock = firstblock;
516 args.flist = flist;
517 args.total = total;
518 args.whichfork = XFS_DATA_FORK;
519 args.trans = trans;
520 args.justcheck = args.addname = args.oknoent = 0;
521
522 /*
523 * Decide on what work routines to call based on the inode size.
524 */
525 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
526 retval = xfs_dir_shortform_replace(&args);
527 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
528 retval = xfs_dir_leaf_replace(&args);
529 } else {
530 retval = xfs_dir_node_replace(&args);
531 }
532
533 return(retval);
534}
535
536static int
537xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
538{
539 xfs_ino_t ino;
540 int namelen_sum;
541 int count;
542 xfs_dir_shortform_t *sf;
543 xfs_dir_sf_entry_t *sfe;
544 int i;
545
546
547
548 if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) {
549 return 0;
550 }
551 if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
552 return 0;
553 }
554 if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
555 xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p",
556 dp);
557 return 1;
558 }
559 sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
560 ino = XFS_GET_DIR_INO8(sf->hdr.parent);
561 if (xfs_dir_ino_validate(mp, ino))
562 return 1;
563
564 count = sf->hdr.count;
565 if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
566 xfs_fs_cmn_err(CE_WARN, mp,
567 "Invalid shortform count: dp 0x%p", dp);
568 return(1);
569 }
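	/*
	 * Added note: the "count * 10" bound above works because a
	 * shortform entry is at least sizeof(xfs_dir_sf_entry_t) bytes:
	 * an 8-byte inumber, a namelen byte and at least one name byte,
	 * i.e. 10, so more than XFS_LITINO(mp) / 10 entries can never
	 * fit in the inode literal area.
	 */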
570
571 if (count == 0) {
572 return 0;
573 }
574
575 namelen_sum = 0;
576 sfe = &sf->list[0];
577 for (i = sf->hdr.count - 1; i >= 0; i--) {
578 ino = XFS_GET_DIR_INO8(sfe->inumber);
579 xfs_dir_ino_validate(mp, ino);
580 if (sfe->namelen >= XFS_LITINO(mp)) {
581 xfs_fs_cmn_err(CE_WARN, mp,
582 "Invalid shortform namelen: dp 0x%p", dp);
583 return 1;
584 }
585 namelen_sum += sfe->namelen;
586 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
587 }
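	/*
	 * Added note: unlike the parent-inode check above, the return
	 * value of the per-entry xfs_dir_ino_validate() call in this
	 * loop is discarded; only namelen problems fail the scan here.
	 */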
588 if (namelen_sum >= XFS_LITINO(mp)) {
589 xfs_fs_cmn_err(CE_WARN, mp,
590 "Invalid shortform namelen: dp 0x%p", dp);
591 return 1;
592 }
593
594 return 0;
595}
596
597/*========================================================================
598 * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
599 *========================================================================*/
600
601/*
602 * Add a name to the leaf directory structure
603 * This is the external routine.
604 */
605int
606xfs_dir_leaf_addname(xfs_da_args_t *args)
607{
608 int index, retval;
609 xfs_dabuf_t *bp;
610
611 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
612 XFS_DATA_FORK);
613 if (retval)
614 return(retval);
615 ASSERT(bp != NULL);
616
617 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
618 if (retval == ENOENT)
619 retval = xfs_dir_leaf_add(bp, args, index);
620 xfs_da_buf_done(bp);
621 return(retval);
622}
623
624/*
625 * Remove a name from the leaf directory structure
626 * This is the external routine.
627 */
628STATIC int
629xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
630{
631 xfs_dir_leafblock_t *leaf;
632 int index, retval;
633 xfs_dabuf_t *bp;
634
635 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
636 XFS_DATA_FORK);
637 if (retval)
638 return(retval);
639 ASSERT(bp != NULL);
640 leaf = bp->data;
641 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
642 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
643 if (retval == EEXIST) {
644 (void)xfs_dir_leaf_remove(args->trans, bp, index);
645 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
646 *totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
647 retval = 0;
648 }
649 xfs_da_buf_done(bp);
650 return(retval);
651}
652
653/*
654 * Look up a name in a leaf directory structure.
655 * This is the external routine.
656 */
657STATIC int
658xfs_dir_leaf_lookup(xfs_da_args_t *args)
659{
660 int index, retval;
661 xfs_dabuf_t *bp;
662
663 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
664 XFS_DATA_FORK);
665 if (retval)
666 return(retval);
667 ASSERT(bp != NULL);
668 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
669 xfs_da_brelse(args->trans, bp);
670 return(retval);
671}
672
673/*
674 * Copy out directory entries for getdents(), for leaf directories.
675 */
676STATIC int
677xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
678 int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
679{
680 xfs_dabuf_t *bp;
681 int retval, eob;
682
683 retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
684 if (retval)
685 return(retval);
686 ASSERT(bp != NULL);
687 retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
688 xfs_da_brelse(trans, bp);
689 *eofp = (eob == 0);
690 return(retval);
691}
692
693/*
694 * Look up a name in a leaf directory structure, replace the inode number.
695 * This is the external routine.
696 */
697STATIC int
698xfs_dir_leaf_replace(xfs_da_args_t *args)
699{
700 int index, retval;
701 xfs_dabuf_t *bp;
702 xfs_ino_t inum;
703 xfs_dir_leafblock_t *leaf;
704 xfs_dir_leaf_entry_t *entry;
705 xfs_dir_leaf_name_t *namest;
706
707 inum = args->inumber;
708 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
709 XFS_DATA_FORK);
710 if (retval)
711 return(retval);
712 ASSERT(bp != NULL);
713 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
714 if (retval == EEXIST) {
715 leaf = bp->data;
716 entry = &leaf->entries[index];
717 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
718 /* XXX - replace assert? */
719 XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
720 xfs_da_log_buf(args->trans, bp,
721 XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
722 xfs_da_buf_done(bp);
723 retval = 0;
724 } else
725 xfs_da_brelse(args->trans, bp);
726 return(retval);
727}
728
729
730/*========================================================================
731 * External routines when dirsize > XFS_LBSIZE(mp).
732 *========================================================================*/
733
734/*
735 * Add a name to a Btree-format directory.
736 *
737 * This will involve walking down the Btree, and may involve splitting
738 * leaf nodes and even splitting intermediate nodes up to and including
739 * the root node (a special case of an intermediate node).
740 */
741STATIC int
742xfs_dir_node_addname(xfs_da_args_t *args)
743{
744 xfs_da_state_t *state;
745 xfs_da_state_blk_t *blk;
746 int retval, error;
747
748 /*
749 * Fill in bucket of arguments/results/context to carry around.
750 */
751 state = xfs_da_state_alloc();
752 state->args = args;
753 state->mp = args->dp->i_mount;
754 state->blocksize = state->mp->m_sb.sb_blocksize;
755 state->node_ents = state->mp->m_dir_node_ents;
756
757 /*
758 * Search to see if name already exists, and get back a pointer
759 * to where it should go.
760 */
761 error = xfs_da_node_lookup_int(state, &retval);
762 if (error)
763 retval = error;
764 if (retval != ENOENT)
765 goto error;
766 blk = &state->path.blk[ state->path.active-1 ];
767 ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
768 retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
769 if (retval == 0) {
770 /*
771 * Addition succeeded, update Btree hashvals.
772 */
773 if (!args->justcheck)
774 xfs_da_fixhashpath(state, &state->path);
775 } else {
776 /*
777 * Addition failed, split as many Btree elements as required.
778 */
779 if (args->total == 0) {
780 ASSERT(retval == ENOSPC);
781 goto error;
782 }
783 retval = xfs_da_split(state);
784 }
785error:
786 xfs_da_state_free(state);
787
788 return(retval);
789}
790
791/*
792 * Remove a name from a Btree directory.
793 *
794 * This will involve walking down the Btree, and may involve joining
795 * leaf nodes and even joining intermediate nodes up to and including
796 * the root node (a special case of an intermediate node).
797 */
798STATIC int
799xfs_dir_node_removename(xfs_da_args_t *args)
800{
801 xfs_da_state_t *state;
802 xfs_da_state_blk_t *blk;
803 int retval, error;
804
805 state = xfs_da_state_alloc();
806 state->args = args;
807 state->mp = args->dp->i_mount;
808 state->blocksize = state->mp->m_sb.sb_blocksize;
809 state->node_ents = state->mp->m_dir_node_ents;
810
811 /*
812 * Search to see if name exists, and get back a pointer to it.
813 */
814 error = xfs_da_node_lookup_int(state, &retval);
815 if (error)
816 retval = error;
817 if (retval != EEXIST) {
818 xfs_da_state_free(state);
819 return(retval);
820 }
821
822 /*
823 * Remove the name and update the hashvals in the tree.
824 */
825 blk = &state->path.blk[ state->path.active-1 ];
826 ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
827 retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
828 xfs_da_fixhashpath(state, &state->path);
829
830 /*
831 * Check to see if the tree needs to be collapsed.
832 */
833 error = 0;
834 if (retval) {
835 error = xfs_da_join(state);
836 }
837
838 xfs_da_state_free(state);
839 if (error)
840 return(error);
841 return(0);
842}
843
844/*
845 * Look up a filename in an int (node-format) directory.
846 * Use an internal routine to actually do all the work.
847 */
848STATIC int
849xfs_dir_node_lookup(xfs_da_args_t *args)
850{
851 xfs_da_state_t *state;
852 int retval, error, i;
853
854 state = xfs_da_state_alloc();
855 state->args = args;
856 state->mp = args->dp->i_mount;
857 state->blocksize = state->mp->m_sb.sb_blocksize;
858 state->node_ents = state->mp->m_dir_node_ents;
859
860 /*
861 * Search to see if name exists,
862 * and get back a pointer to it.
863 */
864 error = xfs_da_node_lookup_int(state, &retval);
865 if (error) {
866 retval = error;
867 }
868
869 /*
870 * If not in a transaction, we have to release all the buffers.
871 */
872 for (i = 0; i < state->path.active; i++) {
873 xfs_da_brelse(args->trans, state->path.blk[i].bp);
874 state->path.blk[i].bp = NULL;
875 }
876
877 xfs_da_state_free(state);
878 return(retval);
879}
880
881STATIC int
882xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
883 int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
884{
885 xfs_da_intnode_t *node;
886 xfs_da_node_entry_t *btree;
887 xfs_dir_leafblock_t *leaf = NULL;
888 xfs_dablk_t bno, nextbno;
889 xfs_dahash_t cookhash;
890 xfs_mount_t *mp;
891 int error, eob, i;
892 xfs_dabuf_t *bp;
893 xfs_daddr_t nextda;
894
895 /*
896 * Pick up our context.
897 */
898 mp = dp->i_mount;
899 bp = NULL;
900 bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
901 cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
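	/*
	 * Added note: the 64-bit uio_offset doubles as the seek cookie.
	 * XFS_DA_MAKE_COOKIE() packs a leaf block number, an entry
	 * index and a hashval into it, and the two macros above unpack
	 * the pieces so we can try to resume exactly where the previous
	 * getdents call stopped.
	 */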
902
903 xfs_dir_trace_g_du("node: start", dp, uio);
904
905 /*
906 * Re-find our place, even if we're confused about what our place is.
907 *
908 * First we check the block number from the magic cookie, it is a
909 * cache of where we ended last time. If we find a leaf block, and
910 * the starting hashval in that block is less than our desired
911 * hashval, then we run with it.
912 */
913 if (bno > 0) {
914 error = xfs_da_read_buf(trans, dp, bno, -2, &bp, XFS_DATA_FORK);
915 if ((error != 0) && (error != EFSCORRUPTED))
916 return(error);
917 if (bp)
918 leaf = bp->data;
919 if (bp && INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
920 xfs_dir_trace_g_dub("node: block not a leaf",
921 dp, uio, bno);
922 xfs_da_brelse(trans, bp);
923 bp = NULL;
924 }
925 if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
926 xfs_dir_trace_g_dub("node: leaf hash too large",
927 dp, uio, bno);
928 xfs_da_brelse(trans, bp);
929 bp = NULL;
930 }
931 if (bp &&
932 cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
933 xfs_dir_trace_g_dub("node: leaf hash too small",
934 dp, uio, bno);
935 xfs_da_brelse(trans, bp);
936 bp = NULL;
937 }
938 }
939
940 /*
941 * If we did not find a leaf block from the blockno in the cookie,
942 * or there was no blockno in the cookie (eg: first time thru),
943 * then we start at the top of the Btree and re-find our hashval.
944 */
945 if (bp == NULL) {
946 xfs_dir_trace_g_du("node: start at root" , dp, uio);
947 bno = 0;
948 for (;;) {
949 error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
950 XFS_DATA_FORK);
951 if (error)
952 return(error);
953 if (bp == NULL)
954 return(XFS_ERROR(EFSCORRUPTED));
955 node = bp->data;
956 if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)
957 break;
958 btree = &node->btree[0];
959 xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
960 for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); btree++, i++) {
961 if (INT_GET(btree->hashval, ARCH_CONVERT) >= cookhash) {
962 bno = INT_GET(btree->before, ARCH_CONVERT);
963 break;
964 }
965 }
966 if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
967 xfs_da_brelse(trans, bp);
968 xfs_dir_trace_g_du("node: hash beyond EOF",
969 dp, uio);
970 uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
971 XFS_DA_MAXHASH);
972 *eofp = 1;
973 return(0);
974 }
975 xfs_dir_trace_g_dub("node: going to block",
976 dp, uio, bno);
977 xfs_da_brelse(trans, bp);
978 }
979 }
980 ASSERT(cookhash != XFS_DA_MAXHASH);
981
982 /*
983 * We've dropped down to the (first) leaf block that contains the
984 * hashval we are interested in. Continue rolling upward thru the
985 * leaf blocks until we fill up our buffer.
986 */
987 for (;;) {
988 leaf = bp->data;
989 if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC)) {
990 xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
991 xfs_da_brelse(trans, bp);
992 XFS_CORRUPTION_ERROR("xfs_dir_node_getdents(1)",
993 XFS_ERRLEVEL_LOW, mp, leaf);
994 return XFS_ERROR(EFSCORRUPTED);
995 }
996 xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
997 if ((nextbno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))) {
998 nextda = xfs_da_reada_buf(trans, dp, nextbno,
999 XFS_DATA_FORK);
1000 } else
1001 nextda = -1;
1002 error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
1003 put, nextda);
1004 xfs_da_brelse(trans, bp);
1005 bno = nextbno;
1006 if (eob) {
1007 xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
1008 *eofp = 0;
1009 return(error);
1010 }
1011 if (bno == 0)
1012 break;
1013 error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
1014 XFS_DATA_FORK);
1015 if (error)
1016 return(error);
1017 if (unlikely(bp == NULL)) {
1018 XFS_ERROR_REPORT("xfs_dir_node_getdents(2)",
1019 XFS_ERRLEVEL_LOW, mp);
1020 return(XFS_ERROR(EFSCORRUPTED));
1021 }
1022 }
1023 *eofp = 1;
1024 xfs_dir_trace_g_du("node: E-O-F", dp, uio);
1025 return(0);
1026}
1027
1028/*
1029 * Look up a filename in an int (node-format) directory, replace the
1030 * inode number.
1030 * Use an internal routine to actually do the lookup.
1031 */
1032STATIC int
1033xfs_dir_node_replace(xfs_da_args_t *args)
1034{
1035 xfs_da_state_t *state;
1036 xfs_da_state_blk_t *blk;
1037 xfs_dir_leafblock_t *leaf;
1038 xfs_dir_leaf_entry_t *entry;
1039 xfs_dir_leaf_name_t *namest;
1040 xfs_ino_t inum;
1041 int retval, error, i;
1042 xfs_dabuf_t *bp;
1043
1044 state = xfs_da_state_alloc();
1045 state->args = args;
1046 state->mp = args->dp->i_mount;
1047 state->blocksize = state->mp->m_sb.sb_blocksize;
1048 state->node_ents = state->mp->m_dir_node_ents;
1049 inum = args->inumber;
1050
1051 /*
1052 * Search to see if name exists,
1053 * and get back a pointer to it.
1054 */
1055 error = xfs_da_node_lookup_int(state, &retval);
1056 if (error) {
1057 retval = error;
1058 }
1059
1060 if (retval == EEXIST) {
1061 blk = &state->path.blk[state->path.active - 1];
1062 ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
1063 bp = blk->bp;
1064 leaf = bp->data;
1065 entry = &leaf->entries[blk->index];
1066 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
1067 /* XXX - replace assert ? */
1068 XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
1069 xfs_da_log_buf(args->trans, bp,
1070 XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
1071 xfs_da_buf_done(bp);
1072 blk->bp = NULL;
1073 retval = 0;
1074 } else {
1075 i = state->path.active - 1;
1076 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1077 state->path.blk[i].bp = NULL;
1078 }
1079 for (i = 0; i < state->path.active - 1; i++) {
1080 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1081 state->path.blk[i].bp = NULL;
1082 }
1083
1084 xfs_da_state_free(state);
1085 return(retval);
1086}
1087
1088#if defined(XFS_DIR_TRACE)
1089/*
1090 * Add a trace buffer entry for an inode and a uio.
1091 */
1092void
1093xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
1094{
1095 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
1096 (void *)dp, (void *)dp->i_mount,
1097 (void *)((unsigned long)(uio->uio_offset >> 32)),
1098 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1099 (void *)(unsigned long)uio->uio_resid,
1100 NULL, NULL, NULL, NULL, NULL, NULL, NULL);
1101}
1102
1103/*
1104 * Add a trace buffer entry for an inode and a uio.
1105 */
1106void
1107xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
1108{
1109 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
1110 (void *)dp, (void *)dp->i_mount,
1111 (void *)((unsigned long)(uio->uio_offset >> 32)),
1112 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1113 (void *)(unsigned long)uio->uio_resid,
1114 (void *)(unsigned long)bno,
1115 NULL, NULL, NULL, NULL, NULL, NULL);
1116}
1117
1118/*
1119 * Add a trace buffer entry for an inode and a uio.
1120 */
1121void
1122xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
1123 xfs_da_intnode_t *node)
1124{
1125 int last = INT_GET(node->hdr.count, ARCH_CONVERT) - 1;
1126
1127 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
1128 (void *)dp, (void *)dp->i_mount,
1129 (void *)((unsigned long)(uio->uio_offset >> 32)),
1130 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1131 (void *)(unsigned long)uio->uio_resid,
1132 (void *)(unsigned long)
1133 INT_GET(node->hdr.info.forw, ARCH_CONVERT),
1134 (void *)(unsigned long)
1135 INT_GET(node->hdr.count, ARCH_CONVERT),
1136 (void *)(unsigned long)
1137 INT_GET(node->btree[0].hashval, ARCH_CONVERT),
1138 (void *)(unsigned long)
1139 INT_GET(node->btree[last].hashval, ARCH_CONVERT),
1140 NULL, NULL, NULL);
1141}
1142
1143/*
1144 * Add a trace buffer entry for an inode and a uio.
1145 */
1146void
1147xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
1148 xfs_dir_leafblock_t *leaf)
1149{
1150 int last = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
1151
1152 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
1153 (void *)dp, (void *)dp->i_mount,
1154 (void *)((unsigned long)(uio->uio_offset >> 32)),
1155 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1156 (void *)(unsigned long)uio->uio_resid,
1157 (void *)(unsigned long)
1158 INT_GET(leaf->hdr.info.forw, ARCH_CONVERT),
1159 (void *)(unsigned long)
1160 INT_GET(leaf->hdr.count, ARCH_CONVERT),
1161 (void *)(unsigned long)
1162 INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
1163 (void *)(unsigned long)
1164 INT_GET(leaf->entries[last].hashval, ARCH_CONVERT),
1165 NULL, NULL, NULL);
1166}
1167
1168/*
1169 * Add a trace buffer entry for an inode and a uio.
1170 */
1171void
1172xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
1173 xfs_dir_leaf_entry_t *entry)
1174{
1175 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
1176 (void *)dp, (void *)dp->i_mount,
1177 (void *)((unsigned long)(uio->uio_offset >> 32)),
1178 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1179 (void *)(unsigned long)uio->uio_resid,
1180 (void *)(unsigned long)
1181 INT_GET(entry->hashval, ARCH_CONVERT),
1182 NULL, NULL, NULL, NULL, NULL, NULL);
1183}
1184
1185/*
1186 * Add a trace buffer entry for an inode and a uio.
1187 */
1188void
1189xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
1190{
1191 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
1192 (void *)dp, (void *)dp->i_mount,
1193 (void *)((unsigned long)(uio->uio_offset >> 32)),
1194 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1195 (void *)(unsigned long)uio->uio_resid,
1196 (void *)((unsigned long)(cookie >> 32)),
1197 (void *)((unsigned long)(cookie & 0xFFFFFFFF)),
1198 NULL, NULL, NULL, NULL, NULL);
1199}
1200
1201/*
1202 * Add a trace buffer entry for the arguments given to the routine,
1203 * generic form.
1204 */
1205void
1206xfs_dir_trace_enter(int type, char *where,
1207 void * a0, void * a1,
1208 void * a2, void * a3,
1209 void * a4, void * a5,
1210 void * a6, void * a7,
1211 void * a8, void * a9,
1212 void * a10, void * a11)
1213{
1214 ASSERT(xfs_dir_trace_buf);
1215 ktrace_enter(xfs_dir_trace_buf, (void *)(unsigned long)type,
1216 (void *)where,
1217 (void *)a0, (void *)a1, (void *)a2,
1218 (void *)a3, (void *)a4, (void *)a5,
1219 (void *)a6, (void *)a7, (void *)a8,
1220 (void *)a9, (void *)a10, (void *)a11,
1221 NULL, NULL);
1222}
1223#endif /* XFS_DIR_TRACE */
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
new file mode 100644
index 000000000000..4dbc9f54cca5
--- /dev/null
+++ b/fs/xfs/xfs_dir.h
@@ -0,0 +1,154 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR_H__
33#define __XFS_DIR_H__
34
35/*
36 * Large directories are structured around Btrees where all the data
37 * elements are in the leaf nodes. Filenames are hashed into an int,
38 * then that int is used as the index into the Btree. Since the hashval
39 * of a filename may not be unique, we may have duplicate keys. The
40 * internal links in the Btree are logical block offsets into the file.
41 *
42 * Small directories use a different format and are packed as tightly
43 * as possible so as to fit into the literal area of the inode.
44 */
45
46/*========================================================================
47 * Function prototypes for the kernel.
48 *========================================================================*/
49
50struct uio;
51struct xfs_bmap_free;
52struct xfs_da_args;
53struct xfs_dinode;
54struct xfs_inode;
55struct xfs_mount;
56struct xfs_trans;
57
58/*
59 * Directory function types.
60 * Put in structures (xfs_dirops_t) for v1 and v2 directories.
61 */
62typedef void (*xfs_dir_mount_t)(struct xfs_mount *mp);
63typedef int (*xfs_dir_isempty_t)(struct xfs_inode *dp);
64typedef int (*xfs_dir_init_t)(struct xfs_trans *tp,
65 struct xfs_inode *dp,
66 struct xfs_inode *pdp);
67typedef int (*xfs_dir_createname_t)(struct xfs_trans *tp,
68 struct xfs_inode *dp,
69 char *name,
70 int namelen,
71 xfs_ino_t inum,
72 xfs_fsblock_t *first,
73 struct xfs_bmap_free *flist,
74 xfs_extlen_t total);
75typedef int (*xfs_dir_lookup_t)(struct xfs_trans *tp,
76 struct xfs_inode *dp,
77 char *name,
78 int namelen,
79 xfs_ino_t *inum);
80typedef int (*xfs_dir_removename_t)(struct xfs_trans *tp,
81 struct xfs_inode *dp,
82 char *name,
83 int namelen,
84 xfs_ino_t ino,
85 xfs_fsblock_t *first,
86 struct xfs_bmap_free *flist,
87 xfs_extlen_t total);
88typedef int (*xfs_dir_getdents_t)(struct xfs_trans *tp,
89 struct xfs_inode *dp,
90 struct uio *uio,
91 int *eofp);
92typedef int (*xfs_dir_replace_t)(struct xfs_trans *tp,
93 struct xfs_inode *dp,
94 char *name,
95 int namelen,
96 xfs_ino_t inum,
97 xfs_fsblock_t *first,
98 struct xfs_bmap_free *flist,
99 xfs_extlen_t total);
100typedef int (*xfs_dir_canenter_t)(struct xfs_trans *tp,
101 struct xfs_inode *dp,
102 char *name,
103 int namelen);
104typedef int (*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
105 struct xfs_dinode *dip);
106typedef int (*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
107
108typedef struct xfs_dirops {
109 xfs_dir_mount_t xd_mount;
110 xfs_dir_isempty_t xd_isempty;
111 xfs_dir_init_t xd_init;
112 xfs_dir_createname_t xd_createname;
113 xfs_dir_lookup_t xd_lookup;
114 xfs_dir_removename_t xd_removename;
115 xfs_dir_getdents_t xd_getdents;
116 xfs_dir_replace_t xd_replace;
117 xfs_dir_canenter_t xd_canenter;
118 xfs_dir_shortform_validate_ondisk_t xd_shortform_validate_ondisk;
119 xfs_dir_shortform_to_single_t xd_shortform_to_single;
120} xfs_dirops_t;
121
122/*
123 * Overall external interface routines.
124 */
125void xfs_dir_startup(void); /* called exactly once */
126
127#define XFS_DIR_MOUNT(mp) \
128 ((mp)->m_dirops.xd_mount(mp))
129#define XFS_DIR_ISEMPTY(mp,dp) \
130 ((mp)->m_dirops.xd_isempty(dp))
131#define XFS_DIR_INIT(mp,tp,dp,pdp) \
132 ((mp)->m_dirops.xd_init(tp,dp,pdp))
133#define XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
134 ((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
135 total))
136#define XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum) \
137 ((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
138#define XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \
139 ((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
140#define XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp) \
141 ((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
142#define XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total) \
143 ((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
144#define XFS_DIR_CANENTER(mp,tp,dp,name,namelen) \
145 ((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
146#define XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip) \
147 ((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
148#define XFS_DIR_SHORTFORM_TO_SINGLE(mp,args) \
149 ((mp)->m_dirops.xd_shortform_to_single(args))
150
151#define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1)
152extern xfs_dirops_t xfsv1_dirops;
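/*
 * Illustrative use of the dispatch macros (added commentary; names are
 * hypothetical locals):
 *
 *	if (!XFS_DIR_ISEMPTY(mp, dp))
 *		return XFS_ERROR(EEXIST);
 *	error = XFS_DIR_INIT(mp, tp, dp, pdp);
 *
 * Callers go through the per-mount vector so the same code serves v1
 * and v2 directories without testing m_dirversion at each call site.
 */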
153
154#endif /* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
new file mode 100644
index 000000000000..49fc0a3695ae
--- /dev/null
+++ b/fs/xfs/xfs_dir2.c
@@ -0,0 +1,859 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * XFS v2 directory implementation.
35 * Top-level and utility routines.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_sb.h"
46#include "xfs_ag.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode_item.h"
58#include "xfs_inode.h"
59#include "xfs_bmap.h"
60#include "xfs_da_btree.h"
61#include "xfs_dir_leaf.h"
62#include "xfs_dir2_data.h"
63#include "xfs_dir2_leaf.h"
64#include "xfs_dir2_block.h"
65#include "xfs_dir2_node.h"
66#include "xfs_dir2_trace.h"
67#include "xfs_error.h"
68#include "xfs_bit.h"
69
70/*
71 * Declarations for interface routines.
72 */
73static void xfs_dir2_mount(xfs_mount_t *mp);
74static int xfs_dir2_isempty(xfs_inode_t *dp);
75static int xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
76 xfs_inode_t *pdp);
77static int xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
78 char *name, int namelen, xfs_ino_t inum,
79 xfs_fsblock_t *first,
80 xfs_bmap_free_t *flist, xfs_extlen_t total);
81static int xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
82 int namelen, xfs_ino_t *inum);
83static int xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
84 char *name, int namelen, xfs_ino_t ino,
85 xfs_fsblock_t *first,
86 xfs_bmap_free_t *flist, xfs_extlen_t total);
87static int xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
88 int *eofp);
89static int xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
90 int namelen, xfs_ino_t inum,
91 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
92 xfs_extlen_t total);
93static int xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
94 int namelen);
95static int xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
96 xfs_dinode_t *dip);
97
98/*
99 * Utility routine declarations.
100 */
101static int xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
102static int xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
103
104/*
105 * Directory operations vector.
106 */
107xfs_dirops_t xfsv2_dirops = {
108 .xd_mount = xfs_dir2_mount,
109 .xd_isempty = xfs_dir2_isempty,
110 .xd_init = xfs_dir2_init,
111 .xd_createname = xfs_dir2_createname,
112 .xd_lookup = xfs_dir2_lookup,
113 .xd_removename = xfs_dir2_removename,
114 .xd_getdents = xfs_dir2_getdents,
115 .xd_replace = xfs_dir2_replace,
116 .xd_canenter = xfs_dir2_canenter,
117 .xd_shortform_validate_ondisk = xfs_dir2_shortform_validate_ondisk,
118 .xd_shortform_to_single = xfs_dir2_sf_to_block,
119};
120
121/*
122 * Interface routines.
123 */
124
125/*
126 * Initialize directory-related fields in the mount structure.
127 */
128static void
129xfs_dir2_mount(
130 xfs_mount_t *mp) /* filesystem mount point */
131{
132 mp->m_dirversion = 2;
133 ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
134 XFS_MAX_BLOCKSIZE);
135 mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
136 mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
137 mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp));
138 mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
139 mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp));
140 mp->m_attr_node_ents =
141 (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
142 (uint)sizeof(xfs_da_node_entry_t);
143 mp->m_dir_node_ents =
144 (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
145 (uint)sizeof(xfs_da_node_entry_t);
146 mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
147}
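/*
 * Worked example (illustrative numbers, added commentary): with
 * sb_blocklog == 12 (4KB filesystem blocks) and sb_dirblklog == 2,
 * m_dirblksize is 1 << (12 + 2) == 16KB and each directory block
 * spans m_dirblkfsbs == 1 << 2 == 4 filesystem blocks.
 */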
148
149/*
150 * Return 1 if directory contains only "." and "..".
151 */
152static int /* return code */
153xfs_dir2_isempty(
154 xfs_inode_t *dp) /* incore inode structure */
155{
156 xfs_dir2_sf_t *sfp; /* shortform directory structure */
157
158 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
159 /*
160 * Might happen during shutdown.
161 */
162 if (dp->i_d.di_size == 0) {
163 return 1;
164 }
165 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
166 return 0;
167 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
168 return !sfp->hdr.count;
169}
170
171/*
172 * Initialize a directory with its "." and ".." entries.
173 */
174static int /* error */
175xfs_dir2_init(
176 xfs_trans_t *tp, /* transaction pointer */
177 xfs_inode_t *dp, /* incore directory inode */
178 xfs_inode_t *pdp) /* incore parent directory inode */
179{
180 xfs_da_args_t args; /* operation arguments */
181 int error; /* error return value */
182
183 memset((char *)&args, 0, sizeof(args));
184 args.dp = dp;
185 args.trans = tp;
186 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
187 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
188 return error;
189 }
190 return xfs_dir2_sf_create(&args, pdp->i_ino);
191}
192
193/*
194 * Enter a name in a directory.
195 */
196static int /* error */
197xfs_dir2_createname(
198 xfs_trans_t *tp, /* transaction pointer */
199 xfs_inode_t *dp, /* incore directory inode */
200 char *name, /* new entry name */
201 int namelen, /* new entry name length */
202 xfs_ino_t inum, /* new entry inode number */
203 xfs_fsblock_t *first, /* bmap's firstblock */
204 xfs_bmap_free_t *flist, /* bmap's freeblock list */
205 xfs_extlen_t total) /* bmap's total block count */
206{
207 xfs_da_args_t args; /* operation arguments */
208 int rval; /* return value */
209 int v; /* type-checking value */
210
211 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
212 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
213 return rval;
214 }
215 XFS_STATS_INC(xs_dir_create);
216 /*
217 * Fill in the arg structure for this request.
218 */
219 args.name = name;
220 args.namelen = namelen;
221 args.hashval = xfs_da_hashname(name, namelen);
222 args.inumber = inum;
223 args.dp = dp;
224 args.firstblock = first;
225 args.flist = flist;
226 args.total = total;
227 args.whichfork = XFS_DATA_FORK;
228 args.trans = tp;
229 args.justcheck = 0;
230 args.addname = args.oknoent = 1;
231 /*
232 * Decide on what work routines to call based on the inode size.
233 */
234 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
235 rval = xfs_dir2_sf_addname(&args);
236 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
237 return rval;
238 } else if (v)
239 rval = xfs_dir2_block_addname(&args);
240 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
241 return rval;
242 } else if (v)
243 rval = xfs_dir2_leaf_addname(&args);
244 else
245 rval = xfs_dir2_node_addname(&args);
246 return rval;
247}
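/*
 * Added note: the cascade above is the v2 format ladder -- shortform
 * (inline in the inode literal area), then a single combined data
 * block, then leaf blocks with a separate data area, then a full
 * node/Btree structure.  The same shape repeats in the lookup,
 * removename, getdents, replace and canenter routines below.
 */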
248
249/*
250 * Lookup a name in a directory, give back the inode number.
251 */
252static int /* error */
253xfs_dir2_lookup(
254 xfs_trans_t *tp, /* transaction pointer */
255 xfs_inode_t *dp, /* incore directory inode */
256 char *name, /* lookup name */
257 int namelen, /* lookup name length */
258 xfs_ino_t *inum) /* out: inode number */
259{
260 xfs_da_args_t args; /* operation arguments */
261 int rval; /* return value */
262 int v; /* type-checking value */
263
264 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
265 XFS_STATS_INC(xs_dir_lookup);
266
267 /*
268 * Fill in the arg structure for this request.
269 */
270 args.name = name;
271 args.namelen = namelen;
272 args.hashval = xfs_da_hashname(name, namelen);
273 args.inumber = 0;
274 args.dp = dp;
275 args.firstblock = NULL;
276 args.flist = NULL;
277 args.total = 0;
278 args.whichfork = XFS_DATA_FORK;
279 args.trans = tp;
280 args.justcheck = args.addname = 0;
281 args.oknoent = 1;
282 /*
283 * Decide on what work routines to call based on the inode size.
284 */
285 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
286 rval = xfs_dir2_sf_lookup(&args);
287 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
288 return rval;
289 } else if (v)
290 rval = xfs_dir2_block_lookup(&args);
291 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
292 return rval;
293 } else if (v)
294 rval = xfs_dir2_leaf_lookup(&args);
295 else
296 rval = xfs_dir2_node_lookup(&args);
297 if (rval == EEXIST)
298 rval = 0;
299 if (rval == 0)
300 *inum = args.inumber;
301 return rval;
302}
303
304/*
305 * Remove an entry from a directory.
306 */
307static int /* error */
308xfs_dir2_removename(
309 xfs_trans_t *tp, /* transaction pointer */
310 xfs_inode_t *dp, /* incore directory inode */
311 char *name, /* name of entry to remove */
312 int namelen, /* name length of entry to remove */
313 xfs_ino_t ino, /* inode number of entry to remove */
314 xfs_fsblock_t *first, /* bmap's firstblock */
315 xfs_bmap_free_t *flist, /* bmap's freeblock list */
316 xfs_extlen_t total) /* bmap's total block count */
317{
318 xfs_da_args_t args; /* operation arguments */
319 int rval; /* return value */
320 int v; /* type-checking value */
321
322 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
323 XFS_STATS_INC(xs_dir_remove);
324 /*
325 * Fill in the arg structure for this request.
326 */
327 args.name = name;
328 args.namelen = namelen;
329 args.hashval = xfs_da_hashname(name, namelen);
330 args.inumber = ino;
331 args.dp = dp;
332 args.firstblock = first;
333 args.flist = flist;
334 args.total = total;
335 args.whichfork = XFS_DATA_FORK;
336 args.trans = tp;
337 args.justcheck = args.addname = args.oknoent = 0;
338 /*
339 * Decide on what work routines to call based on the inode size.
340 */
341 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
342 rval = xfs_dir2_sf_removename(&args);
343 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
344 return rval;
345 } else if (v)
346 rval = xfs_dir2_block_removename(&args);
347 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
348 return rval;
349 } else if (v)
350 rval = xfs_dir2_leaf_removename(&args);
351 else
352 rval = xfs_dir2_node_removename(&args);
353 return rval;
354}
355
356/*
357 * Read a directory.
358 */
359static int /* error */
360xfs_dir2_getdents(
361 xfs_trans_t *tp, /* transaction pointer */
362 xfs_inode_t *dp, /* incore directory inode */
363 uio_t *uio, /* caller's buffer control */
364 int *eofp) /* out: eof reached */
365{
366 int alignment; /* alignment required for ABI */
367 xfs_dirent_t *dbp; /* malloc'ed buffer */
368 xfs_dir2_put_t put; /* entry formatting routine */
369 int rval; /* return value */
370 int v; /* type-checking value */
371
372 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
373 XFS_STATS_INC(xs_dir_getdents);
374 /*
375 * If our caller has given us a single contiguous aligned memory buffer,
376 * just work directly within that buffer. If it's in user memory,
377 * lock it down first.
378 */
379 alignment = sizeof(xfs_off_t) - 1;
380 if ((uio->uio_iovcnt == 1) &&
381 (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
382 ((uio->uio_iov[0].iov_len & alignment) == 0)) {
383 dbp = NULL;
384 put = xfs_dir2_put_dirent64_direct;
385 } else {
386 dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
387 put = xfs_dir2_put_dirent64_uio;
388 }
389
390 *eofp = 0;
391 /*
392 * Decide on what work routines to call based on the inode size.
393 */
394 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
395 rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
396 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
397 ;
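		/* added note: empty on purpose -- fall out to free dbp */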
398 } else if (v)
399 rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
400 else
401 rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
402 if (dbp != NULL)
403 kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
404 return rval;
405}
406
407/*
408 * Replace the inode number of a directory entry.
409 */
410static int /* error */
411xfs_dir2_replace(
412 xfs_trans_t *tp, /* transaction pointer */
413 xfs_inode_t *dp, /* incore directory inode */
414 char *name, /* name of entry to replace */
415 int namelen, /* name length of entry to replace */
416 xfs_ino_t inum, /* new inode number */
417 xfs_fsblock_t *first, /* bmap's firstblock */
418 xfs_bmap_free_t *flist, /* bmap's freeblock list */
419 xfs_extlen_t total) /* bmap's total block count */
420{
421 xfs_da_args_t args; /* operation arguments */
422 int rval; /* return value */
423 int v; /* type-checking value */
424
425 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
426
427 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
428 return rval;
429 }
430 /*
431 * Fill in the arg structure for this request.
432 */
433 args.name = name;
434 args.namelen = namelen;
435 args.hashval = xfs_da_hashname(name, namelen);
436 args.inumber = inum;
437 args.dp = dp;
438 args.firstblock = first;
439 args.flist = flist;
440 args.total = total;
441 args.whichfork = XFS_DATA_FORK;
442 args.trans = tp;
443 args.justcheck = args.addname = args.oknoent = 0;
444 /*
445 * Decide on what work routines to call based on the inode size.
446 */
447 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
448 rval = xfs_dir2_sf_replace(&args);
449 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
450 return rval;
451 } else if (v)
452 rval = xfs_dir2_block_replace(&args);
453 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
454 return rval;
455 } else if (v)
456 rval = xfs_dir2_leaf_replace(&args);
457 else
458 rval = xfs_dir2_node_replace(&args);
459 return rval;
460}
461
462/*
463 * See if this entry can be added to the directory without allocating space.
464 */
465static int /* error */
466xfs_dir2_canenter(
467 xfs_trans_t *tp, /* transaction pointer */
468 xfs_inode_t *dp, /* incore directory inode */
469 char *name, /* name of entry to add */
470 int namelen) /* name length of entry to add */
471{
472 xfs_da_args_t args; /* operation arguments */
473 int rval; /* return value */
474 int v; /* type-checking value */
475
476 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
477 /*
478 * Fill in the arg structure for this request.
479 */
480 args.name = name;
481 args.namelen = namelen;
482 args.hashval = xfs_da_hashname(name, namelen);
483 args.inumber = 0;
484 args.dp = dp;
485 args.firstblock = NULL;
486 args.flist = NULL;
487 args.total = 0;
488 args.whichfork = XFS_DATA_FORK;
489 args.trans = tp;
490 args.justcheck = args.addname = args.oknoent = 1;
491 /*
492 * Decide on what work routines to call based on the inode size.
493 */
494 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
495 rval = xfs_dir2_sf_addname(&args);
496 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
497 return rval;
498 } else if (v)
499 rval = xfs_dir2_block_addname(&args);
500 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
501 return rval;
502 } else if (v)
503 rval = xfs_dir2_leaf_addname(&args);
504 else
505 rval = xfs_dir2_node_addname(&args);
506 return rval;
507}
508
509/*
510 * Dummy routine for shortform inode validation.
511 * There's nothing useful to check ondisk for shortform, so always succeed.
512 */
513/* ARGSUSED */
514static int /* error */
515xfs_dir2_shortform_validate_ondisk(
516 xfs_mount_t *mp, /* filesystem mount point */
517 xfs_dinode_t *dip) /* ondisk inode */
518{
519 return 0;
520}
521
522/*
523 * Utility routines.
524 */
525
526/*
527 * Add a block to the directory.
528 * This routine is for data and free blocks, not leaf/node blocks,
529 * which are handled by xfs_da_grow_inode.
530 */
531int /* error */
532xfs_dir2_grow_inode(
533 xfs_da_args_t *args, /* operation arguments */
534 int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
535 xfs_dir2_db_t *dbp) /* out: block number added */
536{
537 xfs_fileoff_t bno; /* directory offset of new block */
538 int count; /* count of filesystem blocks */
539 xfs_inode_t *dp; /* incore directory inode */
540 int error; /* error return value */
541 int got; /* blocks actually mapped */
542 int i; /* temp mapping index */
543 xfs_bmbt_irec_t map; /* single structure for bmap */
544 int mapi; /* mapping index */
545 xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */
546 xfs_mount_t *mp; /* filesystem mount point */
547 int nmap; /* number of bmap entries */
548 xfs_trans_t *tp; /* transaction pointer */
549
550 xfs_dir2_trace_args_s("grow_inode", args, space);
551 dp = args->dp;
552 tp = args->trans;
553 mp = dp->i_mount;
554 /*
555 * Set lowest possible block in the space requested.
556 */
557 bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
558 count = mp->m_dirblkfsbs;
559 /*
560 * Find the first hole for our block.
561 */
562 if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
563 return error;
564 }
565 nmap = 1;
566 ASSERT(args->firstblock != NULL);
567 /*
568 * Try mapping the new block contiguously (one extent).
569 */
570 if ((error = xfs_bmapi(tp, dp, bno, count,
571 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
572 args->firstblock, args->total, &map, &nmap,
573 args->flist))) {
574 return error;
575 }
576 ASSERT(nmap <= 1);
577 /*
578 * Got it in 1.
579 */
580 if (nmap == 1) {
581 mapp = &map;
582 mapi = 1;
583 }
584 /*
585 * Didn't work and this is a multiple-fsb directory block.
586 * Try again with the contiguous flag turned off.
587 */
588 else if (nmap == 0 && count > 1) {
589 xfs_fileoff_t b; /* current file offset */
590
591 /*
592 * Space for maximum number of mappings.
593 */
594 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
595 /*
596 * Iterate until we get to the end of our block.
597 */
598 for (b = bno, mapi = 0; b < bno + count; ) {
599 int c; /* current fsb count */
600
601 /*
602 * Can't map more than MAX_NMAP at once.
603 */
604 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
605 c = (int)(bno + count - b);
606 if ((error = xfs_bmapi(tp, dp, b, c,
607 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
608 args->firstblock, args->total,
609 &mapp[mapi], &nmap, args->flist))) {
610 kmem_free(mapp, sizeof(*mapp) * count);
611 return error;
612 }
613 if (nmap < 1)
614 break;
615 /*
616 * Add this bunch into our table, go to the next offset.
617 */
618 mapi += nmap;
619 b = mapp[mapi - 1].br_startoff +
620 mapp[mapi - 1].br_blockcount;
621 }
622 }
623 /*
624 * Didn't work.
625 */
626 else {
627 mapi = 0;
628 mapp = NULL;
629 }
630 /*
631 * See how many fsb's we got.
632 */
633 for (i = 0, got = 0; i < mapi; i++)
634 got += mapp[i].br_blockcount;
635 /*
636 * Didn't get enough fsb's, or the first/last blocks are wrong.
637 */
638 if (got != count || mapp[0].br_startoff != bno ||
639 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
640 bno + count) {
641 if (mapp != &map)
642 kmem_free(mapp, sizeof(*mapp) * count);
643 return XFS_ERROR(ENOSPC);
644 }
645 /*
646 * Done with the temporary mapping table.
647 */
648 if (mapp != &map)
649 kmem_free(mapp, sizeof(*mapp) * count);
650 *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno);
651 /*
652 * Update file's size if this is the data space and it grew.
653 */
654 if (space == XFS_DIR2_DATA_SPACE) {
655 xfs_fsize_t size; /* directory file (data) size */
656
657 size = XFS_FSB_TO_B(mp, bno + count);
658 if (size > dp->i_d.di_size) {
659 dp->i_d.di_size = size;
660 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
661 }
662 }
663 return 0;
664}
665
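/*
 * A minimal usage sketch (hypothetical helper, not a routine in this
 * file): callers hand xfs_dir2_grow_inode a space selector and get back
 * a directory block number.  The XFS_DIR2_*_SPACE values select disjoint
 * byte regions of the directory file (space * XFS_DIR2_SPACE_SIZE), so
 * data, leaf and freeindex blocks can never collide.
 */
#if 0	/* illustrative sketch only */
static int
example_grow_data_block(
	xfs_da_args_t	*args,	/* set up with trans/dp/firstblock/flist */
	xfs_dir2_db_t	*dbp)	/* out: new directory block number */
{
	return xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbp);
}
#endif
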
666/*
667 * See if the directory is a single-block form directory.
668 */
669int /* error */
670xfs_dir2_isblock(
671 xfs_trans_t *tp, /* transaction pointer */
672 xfs_inode_t *dp, /* incore directory inode */
673 int *vp) /* out: 1 is block, 0 is not block */
674{
675 xfs_fileoff_t last; /* last file offset */
676 xfs_mount_t *mp; /* filesystem mount point */
677 int rval; /* return value */
678
679 mp = dp->i_mount;
680 if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
681 return rval;
682 }
683 rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
684 ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
685 *vp = rval;
686 return 0;
687}
688
689/*
690 * See if the directory is a single-leaf form directory.
691 */
692int /* error */
693xfs_dir2_isleaf(
694 xfs_trans_t *tp, /* transaction pointer */
695 xfs_inode_t *dp, /* incore directory inode */
696 int *vp) /* out: 1 is leaf, 0 is not leaf */
697{
698 xfs_fileoff_t last; /* last file offset */
699 xfs_mount_t *mp; /* filesystem mount point */
700 int rval; /* return value */
701
702 mp = dp->i_mount;
703 if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
704 return rval;
705 }
706 *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
707 return 0;
708}
709
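/*
 * A sketch of how the two predicates above combine with the inode
 * format to classify a directory; this mirrors the dispatch pattern in
 * the getdents/replace/canenter routines earlier in this file.  The
 * helper is hypothetical, for illustration only.
 */
#if 0	/* illustrative sketch only */
static int
example_dir2_format(xfs_trans_t *tp, xfs_inode_t *dp, const char **fmt)
{
	int	error;	/* error return value */
	int	v;	/* predicate result */

	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
		*fmt = "shortform";
	else if ((error = xfs_dir2_isblock(tp, dp, &v)))
		return error;
	else if (v)
		*fmt = "block";
	else if ((error = xfs_dir2_isleaf(tp, dp, &v)))
		return error;
	else if (v)
		*fmt = "leaf";
	else
		*fmt = "node";
	return 0;
}
#endif
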
710/*
711 * Getdents put routine for 64-bit ABI, direct form.
712 */
713static int /* error */
714xfs_dir2_put_dirent64_direct(
715 xfs_dir2_put_args_t *pa) /* argument bundle */
716{
717 xfs_dirent_t *idbp; /* dirent pointer */
718 iovec_t *iovp; /* io vector */
719 int namelen; /* entry name length */
720 int reclen; /* entry total length */
721 uio_t *uio; /* I/O control */
722
723 namelen = pa->namelen;
724 reclen = DIRENTSIZE(namelen);
725 uio = pa->uio;
726 /*
727 * Won't fit in the remaining space.
728 */
729 if (reclen > uio->uio_resid) {
730 pa->done = 0;
731 return 0;
732 }
733 iovp = uio->uio_iov;
734 idbp = (xfs_dirent_t *)iovp->iov_base;
735 iovp->iov_base = (char *)idbp + reclen;
736 iovp->iov_len -= reclen;
737 uio->uio_resid -= reclen;
738 idbp->d_reclen = reclen;
739 idbp->d_ino = pa->ino;
740 idbp->d_off = pa->cook;
741 idbp->d_name[namelen] = '\0';
742 pa->done = 1;
743 memcpy(idbp->d_name, pa->name, namelen);
744 return 0;
745}
746
747/*
748 * Getdents put routine for 64-bit ABI, uio form.
749 */
750static int /* error */
751xfs_dir2_put_dirent64_uio(
752 xfs_dir2_put_args_t *pa) /* argument bundle */
753{
754 xfs_dirent_t *idbp; /* dirent pointer */
755 int namelen; /* entry name length */
756 int reclen; /* entry total length */
757 int rval; /* return value */
758 uio_t *uio; /* I/O control */
759
760 namelen = pa->namelen;
761 reclen = DIRENTSIZE(namelen);
762 uio = pa->uio;
763 /*
764 * Won't fit in the remaining space.
765 */
766 if (reclen > uio->uio_resid) {
767 pa->done = 0;
768 return 0;
769 }
770 idbp = pa->dbp;
771 idbp->d_reclen = reclen;
772 idbp->d_ino = pa->ino;
773 idbp->d_off = pa->cook;
774 idbp->d_name[namelen] = '\0';
775 memcpy(idbp->d_name, pa->name, namelen);
776 rval = uio_read((caddr_t)idbp, reclen, uio);
777 pa->done = (rval == 0);
778 return rval;
779}
780
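/*
 * Both put routines above follow the same callback contract, sketched
 * here as a hypothetical skeleton: report done = 0 (without consuming
 * uio space) when the formatted entry won't fit, otherwise emit the
 * entry and report done = 1 so the getdents loop advances.
 */
#if 0	/* illustrative sketch only */
static int
example_put_dirent(xfs_dir2_put_args_t *pa)
{
	int	reclen = DIRENTSIZE(pa->namelen);

	if (reclen > pa->uio->uio_resid) {
		pa->done = 0;		/* caller records a resume cookie */
		return 0;
	}
	/* ... copy pa->ino, pa->cook, pa->name into a dirent ... */
	pa->done = 1;
	return 0;
}
#endif
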
781/*
782 * Remove the given block from the directory.
783 * This routine is used for data and free blocks; leaf/node blocks are
784 * handled by xfs_da_shrink_inode.
785 */
786int
787xfs_dir2_shrink_inode(
788 xfs_da_args_t *args, /* operation arguments */
789 xfs_dir2_db_t db, /* directory block number */
790 xfs_dabuf_t *bp) /* block's buffer */
791{
792 xfs_fileoff_t bno; /* directory file offset */
793 xfs_dablk_t da; /* directory file offset */
794 int done; /* bunmap is finished */
795 xfs_inode_t *dp; /* incore directory inode */
796 int error; /* error return value */
797 xfs_mount_t *mp; /* filesystem mount point */
798 xfs_trans_t *tp; /* transaction pointer */
799
800 xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
801 dp = args->dp;
802 mp = dp->i_mount;
803 tp = args->trans;
804 da = XFS_DIR2_DB_TO_DA(mp, db);
805 /*
806 * Unmap the fsblock(s).
807 */
808 if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
809 XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
810 &done))) {
811 /*
812 * ENOSPC actually can happen if we're in a removename with
813 * no space reservation, and the resulting block removal
814 * would cause a bmap btree split or conversion from extents
815 * to btree. This can only happen for unfragmented
816 * directory blocks, since you need to be punching out
817 * the middle of an extent.
818 * In this case we need to leave the block in the file,
819 * and not binval it.
820 * So the block has to be in a consistent empty state
821 * and appropriately logged.
822 * We don't free up the buffer; the caller can tell the removal
823 * hasn't happened since it got an error back.
824 */
825 return error;
826 }
827 ASSERT(done);
828 /*
829 * Invalidate the buffer from the transaction.
830 */
831 xfs_da_binval(tp, bp);
832 /*
833 * If it's not a data block, we're done.
834 */
835 if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
836 return 0;
837 /*
838 * If the block isn't the last one in the directory, we're done.
839 */
840 if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0))
841 return 0;
842 bno = da;
843 if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
844 /*
845 * This can't really happen unless there's kernel corruption.
846 */
847 return error;
848 }
849 if (db == mp->m_dirdatablk)
850 ASSERT(bno == 0);
851 else
852 ASSERT(bno > 0);
853 /*
854 * Set the size to the new last block.
855 */
856 dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
857 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
858 return 0;
859}
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
new file mode 100644
index 000000000000..8f4fc7f23bcd
--- /dev/null
+++ b/fs/xfs/xfs_dir2.h
@@ -0,0 +1,109 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_H__
33#define __XFS_DIR2_H__
34
35struct uio;
36struct xfs_dabuf;
37struct xfs_da_args;
38struct xfs_dir2_put_args;
39struct xfs_inode;
40struct xfs_trans;
41
42/*
43 * Directory version 2.
44 * There are 4 possible formats:
45 * shortform
46 * single block - data with embedded leaf at the end
47 * multiple data blocks, single leaf+freeindex block
48 * data blocks, node&leaf blocks (btree), freeindex blocks
49 *
50 * The shortform format is in xfs_dir2_sf.h.
51 * The single block format is in xfs_dir2_block.h.
52 * The data block format is in xfs_dir2_data.h.
53 * The leaf and freeindex block formats are in xfs_dir2_leaf.h.
54 * Node blocks are the same as the other version, in xfs_da_btree.h.
55 */
56
57/*
58 * Byte offset in data block and shortform entry.
59 */
60typedef __uint16_t xfs_dir2_data_off_t;
61#define NULLDATAOFF 0xffffU
62typedef uint xfs_dir2_data_aoff_t; /* argument form */
63
64/*
65 * Directory block number (logical dirblk in file)
66 */
67typedef __uint32_t xfs_dir2_db_t;
68
69/*
70 * Byte offset in a directory.
71 */
72typedef xfs_off_t xfs_dir2_off_t;
73
74/*
75 * For getdents, argument struct for put routines.
76 */
77typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa);
78typedef struct xfs_dir2_put_args {
79 xfs_off_t cook; /* cookie of (next) entry */
80 xfs_intino_t ino; /* inode number */
81 struct xfs_dirent *dbp; /* buffer pointer */
82 char *name; /* directory entry name */
83 int namelen; /* length of name */
84 int done; /* output: set if value was stored */
85 xfs_dir2_put_t put; /* put function ptr (i/o) */
86 struct uio *uio; /* uio control structure */
87} xfs_dir2_put_args_t;
88
89#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2)
90extern xfs_dirops_t xfsv2_dirops;
91
92/*
93 * Other interfaces used by the rest of the dir v2 code.
94 */
95extern int
96 xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
97 xfs_dir2_db_t *dbp);
98
99extern int
100 xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
101
102extern int
103 xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
104
105extern int
106 xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
107 struct xfs_dabuf *bp);
108
109#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
new file mode 100644
index 000000000000..bc4c40fcd479
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.c
@@ -0,0 +1,1248 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_block.c
35 * XFS V2 directory implementation, single-block form.
36 * See xfs_dir2_block.h for the format.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_da_btree.h"
59#include "xfs_dir_leaf.h"
60#include "xfs_dir2_data.h"
61#include "xfs_dir2_leaf.h"
62#include "xfs_dir2_block.h"
63#include "xfs_dir2_trace.h"
64#include "xfs_error.h"
65
66/*
67 * Local function prototypes.
68 */
69static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first,
70 int last);
71static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp);
72static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
73 int *entno);
74static int xfs_dir2_block_sort(const void *a, const void *b);
75
76/*
77 * Add an entry to a block directory.
78 */
79int /* error */
80xfs_dir2_block_addname(
81 xfs_da_args_t *args) /* directory op arguments */
82{
83 xfs_dir2_data_free_t *bf; /* bestfree table in block */
84 xfs_dir2_block_t *block; /* directory block structure */
85 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
86 xfs_dabuf_t *bp; /* buffer for block */
87 xfs_dir2_block_tail_t *btp; /* block tail */
88 int compact; /* need to compact leaf ents */
89 xfs_dir2_data_entry_t *dep; /* block data entry */
90 xfs_inode_t *dp; /* directory inode */
91 xfs_dir2_data_unused_t *dup; /* block unused entry */
92 int error; /* error return value */
93 xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */
94 xfs_dahash_t hash; /* hash value of found entry */
95 int high; /* high index for binary srch */
96 int highstale; /* high stale index */
97 int lfloghigh=0; /* last final leaf to log */
98 int lfloglow=0; /* first final leaf to log */
99 int len; /* length of the new entry */
100 int low; /* low index for binary srch */
101 int lowstale; /* low stale index */
102 int mid=0; /* midpoint for binary srch */
103 xfs_mount_t *mp; /* filesystem mount point */
104 int needlog; /* need to log header */
105 int needscan; /* need to rescan freespace */
106 xfs_dir2_data_off_t *tagp; /* pointer to tag value */
107 xfs_trans_t *tp; /* transaction structure */
108
109 xfs_dir2_trace_args("block_addname", args);
110 dp = args->dp;
111 tp = args->trans;
112 mp = dp->i_mount;
113 /*
114 * Read the (one and only) directory block into dabuf bp.
115 */
116 if ((error =
117 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
118 return error;
119 }
120 ASSERT(bp != NULL);
121 block = bp->data;
122 /*
123 * Check the magic number, corrupted if wrong.
124 */
125 if (unlikely(INT_GET(block->hdr.magic, ARCH_CONVERT)
126 != XFS_DIR2_BLOCK_MAGIC)) {
127 XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
128 XFS_ERRLEVEL_LOW, mp, block);
129 xfs_da_brelse(tp, bp);
130 return XFS_ERROR(EFSCORRUPTED);
131 }
132 len = XFS_DIR2_DATA_ENTSIZE(args->namelen);
133 /*
134 * Set up pointers to parts of the block.
135 */
136 bf = block->hdr.bestfree;
137 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
138 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
139 /*
140 * No stale entries? Need space for entry and new leaf.
141 */
142 if (!btp->stale) {
143 /*
144 * Tag just before the first leaf entry.
145 */
146 tagp = (xfs_dir2_data_off_t *)blp - 1;
147 /*
148 * Data object just before the first leaf entry.
149 */
150 enddup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
151 /*
152 * If it's not free then can't do this add without cleaning up:
153 * the space before the first leaf entry needs to be free so it
154 * can be expanded to hold the pointer to the new entry.
155 */
156 if (INT_GET(enddup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
157 dup = enddup = NULL;
158 /*
159 * Check out the biggest freespace and see if it's the same one.
160 */
161 else {
162 dup = (xfs_dir2_data_unused_t *)
163 ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
164 if (dup == enddup) {
165 /*
166 * It is the biggest freespace; is it also too small
167 * to hold the new leaf entry?
168 */
169 if (INT_GET(dup->length, ARCH_CONVERT) < len + (uint)sizeof(*blp)) {
170 /*
171 * Yes, we use the second-largest
172 * entry instead if it works.
173 */
174 if (INT_GET(bf[1].length, ARCH_CONVERT) >= len)
175 dup = (xfs_dir2_data_unused_t *)
176 ((char *)block +
177 INT_GET(bf[1].offset, ARCH_CONVERT));
178 else
179 dup = NULL;
180 }
181 } else {
182 /*
183 * Not the same free entry,
184 * just check its length.
185 */
186 if (INT_GET(dup->length, ARCH_CONVERT) < len) {
187 dup = NULL;
188 }
189 }
190 }
191 compact = 0;
192 }
193 /*
194 * If there are stale entries we'll use one for the leaf.
195 * Is the biggest entry enough to avoid compaction?
196 */
197 else if (INT_GET(bf[0].length, ARCH_CONVERT) >= len) {
198 dup = (xfs_dir2_data_unused_t *)
199 ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
200 compact = 0;
201 }
202 /*
203 * Will need to compact to make this work.
204 */
205 else {
206 /*
207 * Tag just before the first leaf entry.
208 */
209 tagp = (xfs_dir2_data_off_t *)blp - 1;
210 /*
211 * Data object just before the first leaf entry.
212 */
213 dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
214 /*
215 * If it's not free then the data will go where the
216 * leaf data starts now, if it works at all.
217 */
218 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
219 if (INT_GET(dup->length, ARCH_CONVERT) + (INT_GET(btp->stale, ARCH_CONVERT) - 1) *
220 (uint)sizeof(*blp) < len)
221 dup = NULL;
222 } else if ((INT_GET(btp->stale, ARCH_CONVERT) - 1) * (uint)sizeof(*blp) < len)
223 dup = NULL;
224 else
225 dup = (xfs_dir2_data_unused_t *)blp;
226 compact = 1;
227 }
228 /*
229 * If this isn't a real add, we're done with the buffer.
230 */
231 if (args->justcheck)
232 xfs_da_brelse(tp, bp);
233 /*
234 * If we don't have space for the new entry & leaf ...
235 */
236 if (!dup) {
237 /*
238 * Not trying to actually do anything, or don't have
239 * a space reservation: return no-space.
240 */
241 if (args->justcheck || args->total == 0)
242 return XFS_ERROR(ENOSPC);
243 /*
244 * Convert to the next larger format.
245 * Then add the new entry in that format.
246 */
247 error = xfs_dir2_block_to_leaf(args, bp);
248 xfs_da_buf_done(bp);
249 if (error)
250 return error;
251 return xfs_dir2_leaf_addname(args);
252 }
253 /*
254 * Just checking, and it would work, so say so.
255 */
256 if (args->justcheck)
257 return 0;
258 needlog = needscan = 0;
259 /*
260 * If need to compact the leaf entries, do it now.
261 * Leave the highest-numbered stale entry stale.
262 * XXX should be the one closest to mid but mid is not yet computed.
263 */
264 if (compact) {
265 int fromidx; /* source leaf index */
266 int toidx; /* target leaf index */
267
268 for (fromidx = toidx = INT_GET(btp->count, ARCH_CONVERT) - 1,
269 highstale = lfloghigh = -1;
270 fromidx >= 0;
271 fromidx--) {
272 if (INT_GET(blp[fromidx].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
273 if (highstale == -1)
274 highstale = toidx;
275 else {
276 if (lfloghigh == -1)
277 lfloghigh = toidx;
278 continue;
279 }
280 }
281 if (fromidx < toidx)
282 blp[toidx] = blp[fromidx];
283 toidx--;
284 }
285 lfloglow = toidx + 1 - (INT_GET(btp->stale, ARCH_CONVERT) - 1);
286 lfloghigh -= INT_GET(btp->stale, ARCH_CONVERT) - 1;
287 INT_MOD(btp->count, ARCH_CONVERT, -(INT_GET(btp->stale, ARCH_CONVERT) - 1));
288 xfs_dir2_data_make_free(tp, bp,
289 (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
290 (xfs_dir2_data_aoff_t)((INT_GET(btp->stale, ARCH_CONVERT) - 1) * sizeof(*blp)),
291 &needlog, &needscan);
292 blp += INT_GET(btp->stale, ARCH_CONVERT) - 1;
293 INT_SET(btp->stale, ARCH_CONVERT, 1);
294 /*
295 * If we now need to rebuild the bestfree map, do so.
296 * This needs to happen before the next call to use_free.
297 */
298 if (needscan) {
299 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
300 &needlog, NULL);
301 needscan = 0;
302 }
303 }
304 /*
305 * Set leaf logging boundaries to impossible state.
306 * For the no-stale case they're set explicitly.
307 */
308 else if (INT_GET(btp->stale, ARCH_CONVERT)) {
309 lfloglow = INT_GET(btp->count, ARCH_CONVERT);
310 lfloghigh = -1;
311 }
312 /*
313 * Find the last slot with a hash value below ours, -1 if none.
314 */
315 for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; low <= high; ) {
316 mid = (low + high) >> 1;
317 if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
318 break;
319 if (hash < args->hashval)
320 low = mid + 1;
321 else
322 high = mid - 1;
323 }
324 while (mid >= 0 && INT_GET(blp[mid].hashval, ARCH_CONVERT) >= args->hashval) {
325 mid--;
326 }
327 /*
328 * No stale entries, will use enddup space to hold new leaf.
329 */
330 if (!btp->stale) {
331 /*
332 * Mark the space needed for the new leaf entry, now in use.
333 */
334 xfs_dir2_data_use_free(tp, bp, enddup,
335 (xfs_dir2_data_aoff_t)
336 ((char *)enddup - (char *)block + INT_GET(enddup->length, ARCH_CONVERT) -
337 sizeof(*blp)),
338 (xfs_dir2_data_aoff_t)sizeof(*blp),
339 &needlog, &needscan);
340 /*
341 * Update the tail (entry count).
342 */
343 INT_MOD(btp->count, ARCH_CONVERT, +1);
344 /*
345 * If we now need to rebuild the bestfree map, do so.
346 * This needs to happen before the next call to use_free.
347 */
348 if (needscan) {
349 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
350 &needlog, NULL);
351 needscan = 0;
352 }
353 /*
354 * Adjust pointer to the first leaf entry, we're about to move
355 * the table up one to open up space for the new leaf entry.
356 * Then adjust our index to match.
357 */
358 blp--;
359 mid++;
360 if (mid)
361 memmove(blp, &blp[1], mid * sizeof(*blp));
362 lfloglow = 0;
363 lfloghigh = mid;
364 }
365 /*
366 * Use a stale leaf for our new entry.
367 */
368 else {
369 for (lowstale = mid;
370 lowstale >= 0 &&
371 INT_GET(blp[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
372 lowstale--)
373 continue;
374 for (highstale = mid + 1;
375 highstale < INT_GET(btp->count, ARCH_CONVERT) &&
376 INT_GET(blp[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
377 (lowstale < 0 || mid - lowstale > highstale - mid);
378 highstale++)
379 continue;
380 /*
381 * Move entries toward the low-numbered stale entry.
382 */
383 if (lowstale >= 0 &&
384 (highstale == INT_GET(btp->count, ARCH_CONVERT) ||
385 mid - lowstale <= highstale - mid)) {
386 if (mid - lowstale)
387 memmove(&blp[lowstale], &blp[lowstale + 1],
388 (mid - lowstale) * sizeof(*blp));
389 lfloglow = MIN(lowstale, lfloglow);
390 lfloghigh = MAX(mid, lfloghigh);
391 }
392 /*
393 * Move entries toward the high-numbered stale entry.
394 */
395 else {
396 ASSERT(highstale < INT_GET(btp->count, ARCH_CONVERT));
397 mid++;
398 if (highstale - mid)
399 memmove(&blp[mid + 1], &blp[mid],
400 (highstale - mid) * sizeof(*blp));
401 lfloglow = MIN(mid, lfloglow);
402 lfloghigh = MAX(highstale, lfloghigh);
403 }
404 INT_MOD(btp->stale, ARCH_CONVERT, -1);
405 }
406 /*
407 * Point to the new data entry.
408 */
409 dep = (xfs_dir2_data_entry_t *)dup;
410 /*
411 * Fill in the leaf entry.
412 */
413 INT_SET(blp[mid].hashval, ARCH_CONVERT, args->hashval);
414 INT_SET(blp[mid].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
415 xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
416 /*
417 * Mark space for the data entry used.
418 */
419 xfs_dir2_data_use_free(tp, bp, dup,
420 (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
421 (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
422 /*
423 * Create the new data entry.
424 */
425 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
426 dep->namelen = args->namelen;
427 memcpy(dep->name, args->name, args->namelen);
428 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
429 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
430 /*
431 * Clean up the bestfree array and log the header, tail, and entry.
432 */
433 if (needscan)
434 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
435 NULL);
436 if (needlog)
437 xfs_dir2_data_log_header(tp, bp);
438 xfs_dir2_block_log_tail(tp, bp);
439 xfs_dir2_data_log_entry(tp, bp, dep);
440 xfs_dir2_data_check(dp, bp);
441 xfs_da_buf_done(bp);
442 return 0;
443}
444
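/*
 * The compaction step above, reduced to its essentials (hypothetical,
 * standalone helper): pack live leaf entries toward the high end of the
 * array, keep exactly one stale slot for the entry being added, and
 * squeeze the rest out, freeing bytes at the low end where the array
 * borders the data region.
 */
#if 0	/* illustrative sketch only */
static int				/* index of first live entry */
example_compact(xfs_dir2_leaf_entry_t *blp, int count)
{
	int	from;		/* source index */
	int	to;		/* destination index */
	int	kept = 0;	/* stale entries preserved so far */

	for (from = to = count - 1; from >= 0; from--) {
		if (INT_GET(blp[from].address, ARCH_CONVERT) ==
		    XFS_DIR2_NULL_DATAPTR && kept++)
			continue;	/* drop all but one stale entry */
		if (from < to)
			blp[to] = blp[from];
		to--;
	}
	return to + 1;
}
#endif
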
445/*
446 * Readdir for block directories.
447 */
448int /* error */
449xfs_dir2_block_getdents(
450 xfs_trans_t *tp, /* transaction (NULL) */
451 xfs_inode_t *dp, /* incore inode */
452 uio_t *uio, /* caller's buffer control */
453 int *eofp, /* eof reached? (out) */
454 xfs_dirent_t *dbp, /* caller's buffer */
455 xfs_dir2_put_t put) /* abi's formatting function */
456{
457 xfs_dir2_block_t *block; /* directory block structure */
458 xfs_dabuf_t *bp; /* buffer for block */
459 xfs_dir2_block_tail_t *btp; /* block tail */
460 xfs_dir2_data_entry_t *dep; /* block data entry */
461 xfs_dir2_data_unused_t *dup; /* block unused entry */
462 char *endptr; /* end of the data entries */
463 int error; /* error return value */
464 xfs_mount_t *mp; /* filesystem mount point */
465 xfs_dir2_put_args_t p; /* arg package for put rtn */
466 char *ptr; /* current data entry */
467 int wantoff; /* starting block offset */
468
469 mp = dp->i_mount;
470 /*
471 * If the block number in the offset is out of range, we're done.
472 */
473 if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) {
474 *eofp = 1;
475 return 0;
476 }
477 /*
478 * Can't read the block, give up, else get dabuf in bp.
479 */
480 if ((error =
481 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
482 return error;
483 }
484 ASSERT(bp != NULL);
485 /*
486 * Extract the byte offset we start at from the seek pointer.
487 * We'll skip entries before this.
488 */
489 wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset);
490 block = bp->data;
491 xfs_dir2_data_check(dp, bp);
492 /*
493 * Set up values for the loop.
494 */
495 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
496 ptr = (char *)block->u;
497 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
498 p.dbp = dbp;
499 p.put = put;
500 p.uio = uio;
501 /*
502 * Loop over the data portion of the block.
503 * Each object is a real entry (dep) or an unused one (dup).
504 */
505 while (ptr < endptr) {
506 dup = (xfs_dir2_data_unused_t *)ptr;
507 /*
508 * Unused, skip it.
509 */
510 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
511 ptr += INT_GET(dup->length, ARCH_CONVERT);
512 continue;
513 }
514
515 dep = (xfs_dir2_data_entry_t *)ptr;
516
517 /*
518 * Bump pointer for the next iteration.
519 */
520 ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
521 /*
522 * The entry is before the desired starting point, skip it.
523 */
524 if ((char *)dep - (char *)block < wantoff)
525 continue;
526 /*
527 * Set up argument structure for put routine.
528 */
529 p.namelen = dep->namelen;
530
531 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
532 ptr - (char *)block);
533 p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
534#if XFS_BIG_INUMS
535 p.ino += mp->m_inoadd;
536#endif
537 p.name = (char *)dep->name;
538
539 /*
540 * Put the entry in the caller's buffer.
541 */
542 error = p.put(&p);
543
544 /*
545 * If it didn't fit, set the final offset to here & return.
546 */
547 if (!p.done) {
548 uio->uio_offset =
549 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
550 (char *)dep - (char *)block);
551 xfs_da_brelse(tp, bp);
552 return error;
553 }
554 }
555
556 /*
557 * Reached the end of the block.
558 * Set the offset to the (nonexistent) next block and return.
559 */
560 *eofp = 1;
561
562 uio->uio_offset =
563 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
564
565 xfs_da_brelse(tp, bp);
566
567 return 0;
568}
569
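/*
 * A note on the cookies used above: uio_offset values are "dataptr"s
 * that pack a directory block number and a byte offset within it, so a
 * later getdents call resumes exactly where the previous one stopped.
 * A round-trip sketch using the conversion macros (illustrative only):
 */
#if 0	/* illustrative sketch only */
static void
example_cookie_roundtrip(xfs_mount_t *mp)
{
	xfs_dir2_dataptr_t	cook;

	cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 64);
	ASSERT(XFS_DIR2_DATAPTR_TO_DB(mp, cook) == mp->m_dirdatablk);
	ASSERT(XFS_DIR2_DATAPTR_TO_OFF(mp, cook) == 64);
}
#endif
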
570/*
571 * Log leaf entries from the block.
572 */
573static void
574xfs_dir2_block_log_leaf(
575 xfs_trans_t *tp, /* transaction structure */
576 xfs_dabuf_t *bp, /* block buffer */
577 int first, /* index of first logged leaf */
578 int last) /* index of last logged leaf */
579{
580 xfs_dir2_block_t *block; /* directory block structure */
581 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
582 xfs_dir2_block_tail_t *btp; /* block tail */
583 xfs_mount_t *mp; /* filesystem mount point */
584
585 mp = tp->t_mountp;
586 block = bp->data;
587 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
588 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
589 xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),
590 (uint)((char *)&blp[last + 1] - (char *)block - 1));
591}
592
593/*
594 * Log the block tail.
595 */
596static void
597xfs_dir2_block_log_tail(
598 xfs_trans_t *tp, /* transaction structure */
599 xfs_dabuf_t *bp) /* block buffer */
600{
601 xfs_dir2_block_t *block; /* directory block structure */
602 xfs_dir2_block_tail_t *btp; /* block tail */
603 xfs_mount_t *mp; /* filesystem mount point */
604
605 mp = tp->t_mountp;
606 block = bp->data;
607 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
608 xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
609 (uint)((char *)(btp + 1) - (char *)block - 1));
610}
611
612/*
613 * Look up an entry in the block. This is the external routine,
614 * xfs_dir2_block_lookup_int does the real work.
615 */
616int /* error */
617xfs_dir2_block_lookup(
618 xfs_da_args_t *args) /* dir lookup arguments */
619{
620 xfs_dir2_block_t *block; /* block structure */
621 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
622 xfs_dabuf_t *bp; /* block buffer */
623 xfs_dir2_block_tail_t *btp; /* block tail */
624 xfs_dir2_data_entry_t *dep; /* block data entry */
625 xfs_inode_t *dp; /* incore inode */
626 int ent; /* entry index */
627 int error; /* error return value */
628 xfs_mount_t *mp; /* filesystem mount point */
629
630 xfs_dir2_trace_args("block_lookup", args);
631 /*
632 * Get the buffer, look up the entry.
633 * If not found (ENOENT) then return; we hold no buffer.
634 */
635 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
636 return error;
637 dp = args->dp;
638 mp = dp->i_mount;
639 block = bp->data;
640 xfs_dir2_data_check(dp, bp);
641 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
642 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
643 /*
644 * Get the offset from the leaf entry, to point to the data.
645 */
646 dep = (xfs_dir2_data_entry_t *)
647 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
648 /*
649 * Fill in inode number, release the block.
650 */
651 args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
652 xfs_da_brelse(args->trans, bp);
653 return XFS_ERROR(EEXIST);
654}
655
656/*
657 * Internal block lookup routine.
658 */
659static int /* error */
660xfs_dir2_block_lookup_int(
661 xfs_da_args_t *args, /* dir lookup arguments */
662 xfs_dabuf_t **bpp, /* returned block buffer */
663 int *entno) /* returned entry number */
664{
665 xfs_dir2_dataptr_t addr; /* data entry address */
666 xfs_dir2_block_t *block; /* block structure */
667 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
668 xfs_dabuf_t *bp; /* block buffer */
669 xfs_dir2_block_tail_t *btp; /* block tail */
670 xfs_dir2_data_entry_t *dep; /* block data entry */
671 xfs_inode_t *dp; /* incore inode */
672 int error; /* error return value */
673 xfs_dahash_t hash; /* found hash value */
674 int high; /* binary search high index */
675 int low; /* binary search low index */
676 int mid; /* binary search current idx */
677 xfs_mount_t *mp; /* filesystem mount point */
678 xfs_trans_t *tp; /* transaction pointer */
679
680 dp = args->dp;
681 tp = args->trans;
682 mp = dp->i_mount;
683 /*
684 * Read the buffer, return error if we can't get it.
685 */
686 if ((error =
687 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
688 return error;
689 }
690 ASSERT(bp != NULL);
691 block = bp->data;
692 xfs_dir2_data_check(dp, bp);
693 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
694 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
695 /*
696 * Loop doing a binary search for our hash value.
697 * Find our entry, ENOENT if it's not there.
698 */
699 for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; ; ) {
700 ASSERT(low <= high);
701 mid = (low + high) >> 1;
702 if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
703 break;
704 if (hash < args->hashval)
705 low = mid + 1;
706 else
707 high = mid - 1;
708 if (low > high) {
709 ASSERT(args->oknoent);
710 xfs_da_brelse(tp, bp);
711 return XFS_ERROR(ENOENT);
712 }
713 }
714 /*
715 * Back up to the first one with the right hash value.
716 */
717 while (mid > 0 && INT_GET(blp[mid - 1].hashval, ARCH_CONVERT) == args->hashval) {
718 mid--;
719 }
720 /*
721 * Now loop forward through all the entries with the
722 * right hash value looking for our name.
723 */
724 do {
725 if ((addr = INT_GET(blp[mid].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
726 continue;
727 /*
728 * Get pointer to the entry from the leaf.
729 */
730 dep = (xfs_dir2_data_entry_t *)
731 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
732 /*
733 * Compare, if it's right give back buffer & entry number.
734 */
735 if (dep->namelen == args->namelen &&
736 dep->name[0] == args->name[0] &&
737 memcmp(dep->name, args->name, args->namelen) == 0) {
738 *bpp = bp;
739 *entno = mid;
740 return 0;
741 }
742 } while (++mid < INT_GET(btp->count, ARCH_CONVERT) && INT_GET(blp[mid].hashval, ARCH_CONVERT) == hash);
743 /*
744 * No match, release the buffer and return ENOENT.
745 */
746 ASSERT(args->oknoent);
747 xfs_da_brelse(tp, bp);
748 return XFS_ERROR(ENOENT);
749}
750
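/*
 * The lookup pattern above in miniature (hypothetical, standalone
 * helper): binary-search to any entry with the target hash, back up to
 * the first duplicate, then the caller scans forward comparing names,
 * since many names can share one hash value.
 */
#if 0	/* illustrative sketch only */
static int				/* index of first match, -1 if none */
example_hash_search(const xfs_dahash_t *tbl, int count, xfs_dahash_t want)
{
	int	low = 0, high = count - 1, mid;

	while (low <= high) {
		mid = (low + high) >> 1;
		if (tbl[mid] == want) {
			while (mid > 0 && tbl[mid - 1] == want)
				mid--;	/* back up to first duplicate */
			return mid;
		}
		if (tbl[mid] < want)
			low = mid + 1;
		else
			high = mid - 1;
	}
	return -1;
}
#endif
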
751/*
752 * Remove an entry from a block format directory.
753 * If that makes the block small enough to fit in shortform, transform it.
754 */
755int /* error */
756xfs_dir2_block_removename(
757 xfs_da_args_t *args) /* directory operation args */
758{
759 xfs_dir2_block_t *block; /* block structure */
760 xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
761 xfs_dabuf_t *bp; /* block buffer */
762 xfs_dir2_block_tail_t *btp; /* block tail */
763 xfs_dir2_data_entry_t *dep; /* block data entry */
764 xfs_inode_t *dp; /* incore inode */
765 int ent; /* block leaf entry index */
766 int error; /* error return value */
767 xfs_mount_t *mp; /* filesystem mount point */
768 int needlog; /* need to log block header */
769 int needscan; /* need to fixup bestfree */
770 xfs_dir2_sf_hdr_t sfh; /* shortform header */
771 int size; /* shortform size */
772 xfs_trans_t *tp; /* transaction pointer */
773
774 xfs_dir2_trace_args("block_removename", args);
775 /*
776 * Look up the entry in the block. Gets the buffer and entry index.
777 * It will always be there; the vnodeops level does a lookup first.
778 */
779 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
780 return error;
781 }
782 dp = args->dp;
783 tp = args->trans;
784 mp = dp->i_mount;
785 block = bp->data;
786 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
787 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
788 /*
789 * Point to the data entry using the leaf entry.
790 */
791 dep = (xfs_dir2_data_entry_t *)
792 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
793 /*
794 * Mark the data entry's space free.
795 */
796 needlog = needscan = 0;
797 xfs_dir2_data_make_free(tp, bp,
798 (xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
799 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
800 /*
801 * Fix up the block tail.
802 */
803 INT_MOD(btp->stale, ARCH_CONVERT, +1);
804 xfs_dir2_block_log_tail(tp, bp);
805 /*
806 * Remove the leaf entry by marking it stale.
807 */
808 INT_SET(blp[ent].address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
809 xfs_dir2_block_log_leaf(tp, bp, ent, ent);
810 /*
811 * Fix up bestfree, log the header if necessary.
812 */
813 if (needscan)
814 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
815 NULL);
816 if (needlog)
817 xfs_dir2_data_log_header(tp, bp);
818 xfs_dir2_data_check(dp, bp);
819 /*
820 * See if the size as a shortform is good enough.
821 */
822 if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
823 XFS_IFORK_DSIZE(dp)) {
824 xfs_da_buf_done(bp);
825 return 0;
826 }
827 /*
828 * If it works, do the conversion.
829 */
830 return xfs_dir2_block_to_sf(args, bp, size, &sfh);
831}
832
833/*
834 * Replace an entry in a V2 block directory.
835 * Change the inode number to the new value.
836 */
837int /* error */
838xfs_dir2_block_replace(
839 xfs_da_args_t *args) /* directory operation args */
840{
841 xfs_dir2_block_t *block; /* block structure */
842 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
843 xfs_dabuf_t *bp; /* block buffer */
844 xfs_dir2_block_tail_t *btp; /* block tail */
845 xfs_dir2_data_entry_t *dep; /* block data entry */
846 xfs_inode_t *dp; /* incore inode */
847 int ent; /* leaf entry index */
848 int error; /* error return value */
849 xfs_mount_t *mp; /* filesystem mount point */
850
851 xfs_dir2_trace_args("block_replace", args);
852 /*
853 * Lookup the entry in the directory. Get buffer and entry index.
854 * This will always succeed since the caller has already done a lookup.
855 */
856 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
857 return error;
858 }
859 dp = args->dp;
860 mp = dp->i_mount;
861 block = bp->data;
862 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
863 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
864 /*
865 * Point to the data entry we need to change.
866 */
867 dep = (xfs_dir2_data_entry_t *)
868 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
869 ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
870 /*
871 * Change the inode number to the new value.
872 */
873 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
874 xfs_dir2_data_log_entry(args->trans, bp, dep);
875 xfs_dir2_data_check(dp, bp);
876 xfs_da_buf_done(bp);
877 return 0;
878}
879
880/*
881 * Qsort comparison routine for the block leaf entries.
882 */
883static int /* sort order */
884xfs_dir2_block_sort(
885 const void *a, /* first leaf entry */
886 const void *b) /* second leaf entry */
887{
888 const xfs_dir2_leaf_entry_t *la; /* first leaf entry */
889 const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */
890
891 la = a;
892 lb = b;
893 return INT_GET(la->hashval, ARCH_CONVERT) < INT_GET(lb->hashval, ARCH_CONVERT) ? -1 :
894 (INT_GET(la->hashval, ARCH_CONVERT) > INT_GET(lb->hashval, ARCH_CONVERT) ? 1 : 0);
895}
896
897/*
898 * Convert a V2 leaf directory to a V2 block directory if possible.
899 */
900int /* error */
901xfs_dir2_leaf_to_block(
902 xfs_da_args_t *args, /* operation arguments */
903 xfs_dabuf_t *lbp, /* leaf buffer */
904 xfs_dabuf_t *dbp) /* data buffer */
905{
906 xfs_dir2_data_off_t *bestsp; /* leaf bests table */
907 xfs_dir2_block_t *block; /* block structure */
908 xfs_dir2_block_tail_t *btp; /* block tail */
909 xfs_inode_t *dp; /* incore directory inode */
910 xfs_dir2_data_unused_t *dup; /* unused data entry */
911 int error; /* error return value */
912 int from; /* leaf from index */
913 xfs_dir2_leaf_t *leaf; /* leaf structure */
914 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
915 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
916 xfs_mount_t *mp; /* file system mount point */
917 int needlog; /* need to log data header */
918 int needscan; /* need to scan for bestfree */
919 xfs_dir2_sf_hdr_t sfh; /* shortform header */
920 int size; /* bytes used */
921 xfs_dir2_data_off_t *tagp; /* end of entry (tag) */
922 int to; /* block/leaf to index */
923 xfs_trans_t *tp; /* transaction pointer */
924
925 xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp);
926 dp = args->dp;
927 tp = args->trans;
928 mp = dp->i_mount;
929 leaf = lbp->data;
930 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
931 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
932 /*
933 * If there are data blocks other than the first one, take this
934 * opportunity to remove trailing empty data blocks that may have
935 * been left behind during no-space-reservation operations.
936 * These will show up in the leaf bests table.
937 */
938 while (dp->i_d.di_size > mp->m_dirblksize) {
939 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
940 if (INT_GET(bestsp[INT_GET(ltp->bestcount, ARCH_CONVERT) - 1], ARCH_CONVERT) ==
941 mp->m_dirblksize - (uint)sizeof(block->hdr)) {
942 if ((error =
943 xfs_dir2_leaf_trim_data(args, lbp,
944 (xfs_dir2_db_t)(INT_GET(ltp->bestcount, ARCH_CONVERT) - 1))))
945 goto out;
946 } else {
947 error = 0;
948 goto out;
949 }
950 }
951 /*
952 * Read the data block if we don't already have it; give up if it fails.
953 */
954 if (dbp == NULL &&
955 (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
956 XFS_DATA_FORK))) {
957 goto out;
958 }
959 block = dbp->data;
960 ASSERT(INT_GET(block->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
961 /*
962 * Size of the "leaf" area in the block.
963 */
964 size = (uint)sizeof(block->tail) +
965 (uint)sizeof(*lep) * (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
966 /*
967 * Look at the last data entry.
968 */
969 tagp = (xfs_dir2_data_off_t *)((char *)block + mp->m_dirblksize) - 1;
970 dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
971 /*
972 * If it's not free or is too short we can't do it.
973 */
974 if (INT_GET(dup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG || INT_GET(dup->length, ARCH_CONVERT) < size) {
975 error = 0;
976 goto out;
977 }
978 /*
979 * Start converting it to block form.
980 */
981 INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
982 needlog = 1;
983 needscan = 0;
984 /*
985 * Use up the space at the end of the block (blp/btp).
986 */
987 xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
988 &needlog, &needscan);
989 /*
990 * Initialize the block tail.
991 */
992 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
993 INT_SET(btp->count, ARCH_CONVERT, INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
994 btp->stale = 0;
995 xfs_dir2_block_log_tail(tp, dbp);
996 /*
997 * Initialize the block leaf area. We compact out stale entries.
998 */
999 lep = XFS_DIR2_BLOCK_LEAF_P(btp);
1000 for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
1001 if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
1002 continue;
1003 lep[to++] = leaf->ents[from];
1004 }
1005 ASSERT(to == INT_GET(btp->count, ARCH_CONVERT));
1006 xfs_dir2_block_log_leaf(tp, dbp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
1007 /*
1008 * Scan the bestfree if we need it and log the data block header.
1009 */
1010 if (needscan)
1011 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
1012 NULL);
1013 if (needlog)
1014 xfs_dir2_data_log_header(tp, dbp);
1015 /*
1016 * Pitch the old leaf block.
1017 */
1018 error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
1019 lbp = NULL;
1020 if (error) {
1021 goto out;
1022 }
1023 /*
1024 * Now see if the resulting block can be shrunken to shortform.
1025 */
1026 if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
1027 XFS_IFORK_DSIZE(dp)) {
1028 error = 0;
1029 goto out;
1030 }
1031 return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
1032out:
1033 if (lbp)
1034 xfs_da_buf_done(lbp);
1035 if (dbp)
1036 xfs_da_buf_done(dbp);
1037 return error;
1038}
1039
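/*
 * The trim test near the top of the routine above, isolated
 * (hypothetical helper): a trailing data block is removable only when
 * its leaf "bests" entry shows the whole block free, i.e. the block
 * size minus the data header.
 */
#if 0	/* illustrative sketch only */
static int
example_data_block_is_empty(xfs_mount_t *mp, xfs_dir2_data_off_t best)
{
	return best == mp->m_dirblksize - (uint)sizeof(xfs_dir2_data_hdr_t);
}
#endif
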
1040/*
1041 * Convert the shortform directory to block form.
1042 */
1043int /* error */
1044xfs_dir2_sf_to_block(
1045 xfs_da_args_t *args) /* operation arguments */
1046{
1047 xfs_dir2_db_t blkno; /* dir-relative block # (0) */
1048 xfs_dir2_block_t *block; /* block structure */
1049 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
1050 xfs_dabuf_t *bp; /* block buffer */
1051 xfs_dir2_block_tail_t *btp; /* block tail pointer */
1052 char *buf; /* sf buffer */
1053 int buf_len;
1054 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1055 xfs_inode_t *dp; /* incore directory inode */
1056 int dummy; /* trash */
1057 xfs_dir2_data_unused_t *dup; /* unused entry pointer */
1058 int endoffset; /* end of data objects */
1059 int error; /* error return value */
1060 int i; /* index */
1061 xfs_mount_t *mp; /* filesystem mount point */
1062 int needlog; /* need to log block header */
1063 int needscan; /* need to scan block freespc */
1064 int newoffset; /* offset from current entry */
1065 int offset; /* target block offset */
1066 xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
1067 xfs_dir2_sf_t *sfp; /* shortform structure */
1068 xfs_dir2_data_off_t *tagp; /* end of data entry */
1069 xfs_trans_t *tp; /* transaction pointer */
1070
1071 xfs_dir2_trace_args("sf_to_block", args);
1072 dp = args->dp;
1073 tp = args->trans;
1074 mp = dp->i_mount;
1075 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
1076 /*
1077 * Bomb out if the shortform directory is way too short.
1078 */
1079 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
1080 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1081 return XFS_ERROR(EIO);
1082 }
1083 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
1084 ASSERT(dp->i_df.if_u1.if_data != NULL);
1085 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1086 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
1087 /*
1088 * Copy the directory into a temporary heap buffer.
1089 * Then pitch the incore inode data so we can make extents.
1090 */
1091
1092 buf_len = dp->i_df.if_bytes;
1093 buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
1094
1095 memcpy(buf, sfp, dp->i_df.if_bytes);
1096 xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
1097 dp->i_d.di_size = 0;
1098 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1099 /*
1100 * Reset pointer - old sfp is gone.
1101 */
1102 sfp = (xfs_dir2_sf_t *)buf;
1103 /*
1104 * Add block 0 to the inode.
1105 */
1106 error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
1107 if (error) {
1108 kmem_free(buf, buf_len);
1109 return error;
1110 }
1111 /*
1112 * Initialize the data block.
1113 */
1114 error = xfs_dir2_data_init(args, blkno, &bp);
1115 if (error) {
1116 kmem_free(buf, buf_len);
1117 return error;
1118 }
1119 block = bp->data;
1120 INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
1121 /*
1122 * Compute size of block "tail" area.
1123 */
1124 i = (uint)sizeof(*btp) +
1125 (INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
1126 /*
1127 * The whole thing is initialized to free by the init routine.
1128 * Say we're using the leaf and tail area.
1129 */
1130 dup = (xfs_dir2_data_unused_t *)block->u;
1131 needlog = needscan = 0;
1132 xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
1133 &needscan);
1134 ASSERT(needscan == 0);
1135 /*
1136 * Fill in the tail.
1137 */
1138 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
1139 INT_SET(btp->count, ARCH_CONVERT, INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2); /* ., .. */
1140 btp->stale = 0;
1141 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
1142 endoffset = (uint)((char *)blp - (char *)block);
1143 /*
1144 * Remove the freespace, we'll manage it.
1145 */
1146 xfs_dir2_data_use_free(tp, bp, dup,
1147 (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
1148 INT_GET(dup->length, ARCH_CONVERT), &needlog, &needscan);
1149 /*
1150 * Create entry for .
1151 */
1152 dep = (xfs_dir2_data_entry_t *)
1153 ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
1154 INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
1155 dep->namelen = 1;
1156 dep->name[0] = '.';
1157 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1158 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
1159 xfs_dir2_data_log_entry(tp, bp, dep);
1160 INT_SET(blp[0].hashval, ARCH_CONVERT, xfs_dir_hash_dot);
1161 INT_SET(blp[0].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
1162 /*
1163 * Create entry for ..
1164 */
1165 dep = (xfs_dir2_data_entry_t *)
1166 ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
1167 INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
1168 dep->namelen = 2;
1169 dep->name[0] = dep->name[1] = '.';
1170 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1171 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
1172 xfs_dir2_data_log_entry(tp, bp, dep);
1173 INT_SET(blp[1].hashval, ARCH_CONVERT, xfs_dir_hash_dotdot);
1174 INT_SET(blp[1].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
1175 offset = XFS_DIR2_DATA_FIRST_OFFSET;
1176 /*
1177 * Loop over existing entries, stuff them in.
1178 */
1179 if ((i = 0) == INT_GET(sfp->hdr.count, ARCH_CONVERT))
1180 sfep = NULL;
1181 else
1182 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
1183 /*
1184 * Need to preserve the existing offset values in the sf directory.
1185 * Insert holes (unused entries) where necessary.
1186 */
1187 while (offset < endoffset) {
1188 /*
1189 * sfep is null when we reach the end of the list.
1190 */
1191 if (sfep == NULL)
1192 newoffset = endoffset;
1193 else
1194 newoffset = XFS_DIR2_SF_GET_OFFSET(sfep);
1195 /*
1196 * There should be a hole here, make one.
1197 */
1198 if (offset < newoffset) {
1199 dup = (xfs_dir2_data_unused_t *)
1200 ((char *)block + offset);
1201 INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
1202 INT_SET(dup->length, ARCH_CONVERT, newoffset - offset);
1203 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT,
1204 (xfs_dir2_data_off_t)
1205 ((char *)dup - (char *)block));
1206 xfs_dir2_data_log_unused(tp, bp, dup);
1207 (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block,
1208 dup, &dummy);
1209 offset += INT_GET(dup->length, ARCH_CONVERT);
1210 continue;
1211 }
1212 /*
1213 * Copy a real entry.
1214 */
1215 dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
1216 INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp,
1217 XFS_DIR2_SF_INUMBERP(sfep)));
1218 dep->namelen = sfep->namelen;
1219 memcpy(dep->name, sfep->name, dep->namelen);
1220 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1221 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
1222 xfs_dir2_data_log_entry(tp, bp, dep);
1223 INT_SET(blp[2 + i].hashval, ARCH_CONVERT, xfs_da_hashname((char *)sfep->name, sfep->namelen));
1224 INT_SET(blp[2 + i].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp,
1225 (char *)dep - (char *)block));
1226 offset = (int)((char *)(tagp + 1) - (char *)block);
1227 if (++i == INT_GET(sfp->hdr.count, ARCH_CONVERT))
1228 sfep = NULL;
1229 else
1230 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
1231 }
1232 /* Done with the temporary buffer */
1233 kmem_free(buf, buf_len);
1234 /*
1235 * Sort the leaf entries by hash value.
1236 */
1237 qsort(blp, INT_GET(btp->count, ARCH_CONVERT), sizeof(*blp), xfs_dir2_block_sort);
1238 /*
1239 * Log the leaf entry area and tail.
1240 * Already logged the header in data_init, ignore needlog.
1241 */
1242 ASSERT(needscan == 0);
1243 xfs_dir2_block_log_leaf(tp, bp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
1244 xfs_dir2_block_log_tail(tp, bp);
1245 xfs_dir2_data_check(dp, bp);
1246 xfs_da_buf_done(bp);
1247 return 0;
1248}
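/*
 * A note on the hole-filling loop above: shortform entries remember the
 * data-block offsets they had, and getdents cookies encode offsets, so
 * the conversion re-creates unused entries in the gaps rather than
 * repacking the live entries.  The two fields that make a gap parse as
 * an unused entry, in miniature (illustrative only, assumes offset <
 * newoffset):
 */
#if 0	/* illustrative sketch only */
	dup = (xfs_dir2_data_unused_t *)((char *)block + offset);
	INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
	INT_SET(dup->length, ARCH_CONVERT, newoffset - offset);
#endif
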
diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h
new file mode 100644
index 000000000000..5a578b84e246
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.h
@@ -0,0 +1,126 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_BLOCK_H__
33#define __XFS_DIR2_BLOCK_H__
34
35/*
36 * xfs_dir2_block.h
37 * Directory version 2, single block format structures
38 */
39
40struct uio;
41struct xfs_dabuf;
42struct xfs_da_args;
43struct xfs_dir2_data_hdr;
44struct xfs_dir2_leaf_entry;
45struct xfs_inode;
46struct xfs_mount;
47struct xfs_trans;
48
49/*
50 * The single block format is as follows:
51 * xfs_dir2_data_hdr_t structure
52 * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures
53 * xfs_dir2_leaf_entry_t structures
54 * xfs_dir2_block_tail_t structure
55 */
56
57#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */
58
59typedef struct xfs_dir2_block_tail {
60 __uint32_t count; /* count of leaf entries */
61 __uint32_t stale; /* count of stale lf entries */
62} xfs_dir2_block_tail_t;
63
64/*
65 * Generic single-block structure, for xfs_db.
66 */
67typedef struct xfs_dir2_block {
68 xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */
69 xfs_dir2_data_union_t u[1];
70 xfs_dir2_leaf_entry_t leaf[1];
71 xfs_dir2_block_tail_t tail;
72} xfs_dir2_block_t;
73
74/*
75 * Pointer to the block tail embedded in a data block (1-block format)
76 */
77#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
78xfs_dir2_block_tail_t *
79xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block);
80#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block)
81#else
82#define XFS_DIR2_BLOCK_TAIL_P(mp,block) \
83 (((xfs_dir2_block_tail_t *)((char *)(block) + (mp)->m_dirblksize)) - 1)
84#endif
85
86/*
87 * Pointer to the leaf entries embedded in a data block (1-block format)
88 */
89#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
90struct xfs_dir2_leaf_entry *xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp);
91#define XFS_DIR2_BLOCK_LEAF_P(btp) \
92 xfs_dir2_block_leaf_p(btp)
93#else
94#define XFS_DIR2_BLOCK_LEAF_P(btp) \
95 (((struct xfs_dir2_leaf_entry *)(btp)) - INT_GET((btp)->count, ARCH_CONVERT))
96#endif
97
98/*
99 * Function declarations.
100 */
101
102extern int
103 xfs_dir2_block_addname(struct xfs_da_args *args);
104
105extern int
106 xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
107 struct uio *uio, int *eofp, struct xfs_dirent *dbp,
108 xfs_dir2_put_t put);
109
110extern int
111 xfs_dir2_block_lookup(struct xfs_da_args *args);
112
113extern int
114 xfs_dir2_block_removename(struct xfs_da_args *args);
115
116extern int
117 xfs_dir2_block_replace(struct xfs_da_args *args);
118
119extern int
120 xfs_dir2_leaf_to_block(struct xfs_da_args *args, struct xfs_dabuf *lbp,
121 struct xfs_dabuf *dbp);
122
123extern int
124 xfs_dir2_sf_to_block(struct xfs_da_args *args);
125
126#endif /* __XFS_DIR2_BLOCK_H__ */
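A quick aside on the two pointer macros above: both work backward from the end of the directory block, since the tail occupies the block's last bytes and the leaf entry array ends where the tail begins. Below is an editorial sketch of that arithmetic with stand-in types, an assumed 4096-byte block size, and the endianness handling (ARCH_CONVERT) omitted; it is not code from the tree.

#include <assert.h>
#include <stdint.h>

typedef struct { uint32_t count, stale; } tail_t;	/* stands in for xfs_dir2_block_tail_t */
typedef struct { uint32_t hashval, address; } lent_t;	/* stands in for xfs_dir2_leaf_entry_t */

int main(void)
{
	static char block[4096];	/* assumed m_dirblksize */

	/* XFS_DIR2_BLOCK_TAIL_P: the last tail_t in the block */
	tail_t *btp = (tail_t *)(block + sizeof(block)) - 1;
	btp->count = 5;
	/* XFS_DIR2_BLOCK_LEAF_P: count entries immediately before the tail */
	lent_t *lep = (lent_t *)btp - btp->count;
	assert((char *)(lep + btp->count) == (char *)btp);
	return 0;
}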
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
new file mode 100644
index 000000000000..db9887a107de
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.c
@@ -0,0 +1,855 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_data.c
35 * Core data block handling routines for XFS V2 directories.
36 * See xfs_dir2_data.h for data structures.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_da_btree.h"
58#include "xfs_dir_leaf.h"
59#include "xfs_dir2_data.h"
60#include "xfs_dir2_leaf.h"
61#include "xfs_dir2_block.h"
62#include "xfs_error.h"
63
64#ifdef DEBUG
65/*
66 * Check the consistency of the data block.
67 * The input can also be a block-format directory.
68 * Pop an assert if we find anything bad.
69 */
70void
71xfs_dir2_data_check(
72 xfs_inode_t *dp, /* incore inode pointer */
73 xfs_dabuf_t *bp) /* data block's buffer */
74{
75 xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
76 xfs_dir2_data_free_t *bf; /* bestfree table */
77 xfs_dir2_block_tail_t *btp=NULL; /* block tail */
78 int count; /* count of entries found */
79 xfs_dir2_data_t *d; /* data block pointer */
80 xfs_dir2_data_entry_t *dep; /* data entry */
81 xfs_dir2_data_free_t *dfp; /* bestfree entry */
82 xfs_dir2_data_unused_t *dup; /* unused entry */
83 char *endp; /* end of useful data */
84 int freeseen; /* mask of bestfrees seen */
85 xfs_dahash_t hash; /* hash of current name */
86 int i; /* leaf index */
87 int lastfree; /* last entry was unused */
88 xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
89 xfs_mount_t *mp; /* filesystem mount point */
90 char *p; /* current data position */
91 int stale; /* count of stale leaves */
92
93 mp = dp->i_mount;
94 d = bp->data;
95 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
96 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
97 bf = d->hdr.bestfree;
98 p = (char *)d->u;
99 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
100 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
101 lep = XFS_DIR2_BLOCK_LEAF_P(btp);
102 endp = (char *)lep;
103 } else
104 endp = (char *)d + mp->m_dirblksize;
105 count = lastfree = freeseen = 0;
106 /*
107 * Account for zero bestfree entries.
108 */
109 if (!bf[0].length) {
110 ASSERT(!bf[0].offset);
111 freeseen |= 1 << 0;
112 }
113 if (!bf[1].length) {
114 ASSERT(!bf[1].offset);
115 freeseen |= 1 << 1;
116 }
117 if (!bf[2].length) {
118 ASSERT(!bf[2].offset);
119 freeseen |= 1 << 2;
120 }
121 ASSERT(INT_GET(bf[0].length, ARCH_CONVERT) >= INT_GET(bf[1].length, ARCH_CONVERT));
122 ASSERT(INT_GET(bf[1].length, ARCH_CONVERT) >= INT_GET(bf[2].length, ARCH_CONVERT));
123 /*
124 * Loop over the data/unused entries.
125 */
126 while (p < endp) {
127 dup = (xfs_dir2_data_unused_t *)p;
128 /*
129 * If it's unused, look for the space in the bestfree table.
130 * If we find it, account for that, else make sure it
131 * doesn't need to be there.
132 */
133 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
134 ASSERT(lastfree == 0);
135 ASSERT(INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT) ==
136 (char *)dup - (char *)d);
137 dfp = xfs_dir2_data_freefind(d, dup);
138 if (dfp) {
139 i = (int)(dfp - bf);
140 ASSERT((freeseen & (1 << i)) == 0);
141 freeseen |= 1 << i;
142 } else
143 ASSERT(INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(bf[2].length, ARCH_CONVERT));
144 p += INT_GET(dup->length, ARCH_CONVERT);
145 lastfree = 1;
146 continue;
147 }
148 /*
149 * It's a real entry. Validate the fields.
150 * If this is a block directory then make sure it's
151 * in the leaf section of the block.
152 * The linear search is crude but this is DEBUG code.
153 */
154 dep = (xfs_dir2_data_entry_t *)p;
155 ASSERT(dep->namelen != 0);
156 ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
157 ASSERT(INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT) ==
158 (char *)dep - (char *)d);
159 count++;
160 lastfree = 0;
161 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
162 addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
163 (xfs_dir2_data_aoff_t)
164 ((char *)dep - (char *)d));
165 hash = xfs_da_hashname((char *)dep->name, dep->namelen);
166 for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
167 if (INT_GET(lep[i].address, ARCH_CONVERT) == addr &&
168 INT_GET(lep[i].hashval, ARCH_CONVERT) == hash)
169 break;
170 }
171 ASSERT(i < INT_GET(btp->count, ARCH_CONVERT));
172 }
173 p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
174 }
175 /*
176 * Need to have seen all the entries and all the bestfree slots.
177 */
178 ASSERT(freeseen == 7);
179 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
180 for (i = stale = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
181 if (INT_GET(lep[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
182 stale++;
183 if (i > 0)
184 ASSERT(INT_GET(lep[i].hashval, ARCH_CONVERT) >= INT_GET(lep[i - 1].hashval, ARCH_CONVERT));
185 }
186 ASSERT(count == INT_GET(btp->count, ARCH_CONVERT) - INT_GET(btp->stale, ARCH_CONVERT));
187 ASSERT(stale == INT_GET(btp->stale, ARCH_CONVERT));
188 }
189}
190#endif
191
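/*
 * [Editorial sketch, not part of the original file]  The freeseen logic
 * in xfs_dir2_data_check() above is a 3-bit presence mask over the
 * bestfree slots: a bit is pre-set for each empty slot, set again
 * (exactly once) when a free region matching that slot is found, and
 * must end up as 7.  A hypothetical user-space model:
 */
#if 0	/* illustrative only */
static int model_freeseen(const unsigned short len[3], const int found[3])
{
	int	i, freeseen = 0;

	for (i = 0; i < 3; i++)
		if (len[i] == 0)
			freeseen |= 1 << i;	/* empty slots need no match */
	for (i = 0; i < 3; i++)
		if (found[i]) {
			ASSERT((freeseen & (1 << i)) == 0);
			freeseen |= 1 << i;	/* each slot matched once */
		}
	return freeseen == 7;
}
#endif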
192/*
193 * Given a data block and an unused entry from that block,
194 * return the bestfree entry, if any, that corresponds to it.
195 */
196xfs_dir2_data_free_t *
197xfs_dir2_data_freefind(
198 xfs_dir2_data_t *d, /* data block */
199 xfs_dir2_data_unused_t *dup) /* data unused entry */
200{
201 xfs_dir2_data_free_t *dfp; /* bestfree entry */
202 xfs_dir2_data_aoff_t off; /* offset value needed */
203#if defined(DEBUG) && defined(__KERNEL__)
204 int matched; /* matched the value */
205 int seenzero; /* saw a 0 bestfree entry */
206#endif
207
208 off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d);
209#if defined(DEBUG) && defined(__KERNEL__)
210 /*
211 * Validate some consistency in the bestfree table.
212 * Check order, non-overlapping entries, and if we find the
213	 * one we're looking for, it has to be exact.
214 */
215 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
216 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
217 for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0;
218 dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
219 dfp++) {
220 if (!dfp->offset) {
221 ASSERT(!dfp->length);
222 seenzero = 1;
223 continue;
224 }
225 ASSERT(seenzero == 0);
226 if (INT_GET(dfp->offset, ARCH_CONVERT) == off) {
227 matched = 1;
228 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(dup->length, ARCH_CONVERT));
229 } else if (off < INT_GET(dfp->offset, ARCH_CONVERT))
230 ASSERT(off + INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(dfp->offset, ARCH_CONVERT));
231 else
232 ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) + INT_GET(dfp->length, ARCH_CONVERT) <= off);
233 ASSERT(matched || INT_GET(dfp->length, ARCH_CONVERT) >= INT_GET(dup->length, ARCH_CONVERT));
234 if (dfp > &d->hdr.bestfree[0])
235 ASSERT(INT_GET(dfp[-1].length, ARCH_CONVERT) >= INT_GET(dfp[0].length, ARCH_CONVERT));
236 }
237#endif
238 /*
239 * If this is smaller than the smallest bestfree entry,
240 * it can't be there since they're sorted.
241 */
242 if (INT_GET(dup->length, ARCH_CONVERT) < INT_GET(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length, ARCH_CONVERT))
243 return NULL;
244 /*
245 * Look at the three bestfree entries for our guy.
246 */
247 for (dfp = &d->hdr.bestfree[0];
248 dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
249 dfp++) {
250 if (!dfp->offset)
251 return NULL;
252 if (INT_GET(dfp->offset, ARCH_CONVERT) == off)
253 return dfp;
254 }
255 /*
256 * Didn't find it. This only happens if there are duplicate lengths.
257 */
258 return NULL;
259}
260
261/*
262 * Insert an unused-space entry into the bestfree table.
263 */
264xfs_dir2_data_free_t * /* entry inserted */
265xfs_dir2_data_freeinsert(
266 xfs_dir2_data_t *d, /* data block pointer */
267 xfs_dir2_data_unused_t *dup, /* unused space */
268 int *loghead) /* log the data header (out) */
269{
270 xfs_dir2_data_free_t *dfp; /* bestfree table pointer */
271 xfs_dir2_data_free_t new; /* new bestfree entry */
272
273#ifdef __KERNEL__
274 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
275 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
276#endif
277 dfp = d->hdr.bestfree;
278 INT_COPY(new.length, dup->length, ARCH_CONVERT);
279 INT_SET(new.offset, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dup - (char *)d));
280 /*
281 * Insert at position 0, 1, or 2; or not at all.
282 */
283 if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[0].length, ARCH_CONVERT)) {
284 dfp[2] = dfp[1];
285 dfp[1] = dfp[0];
286 dfp[0] = new;
287 *loghead = 1;
288 return &dfp[0];
289 }
290 if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[1].length, ARCH_CONVERT)) {
291 dfp[2] = dfp[1];
292 dfp[1] = new;
293 *loghead = 1;
294 return &dfp[1];
295 }
296 if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[2].length, ARCH_CONVERT)) {
297 dfp[2] = new;
298 *loghead = 1;
299 return &dfp[2];
300 }
301 return NULL;
302}
303
304/*
305 * Remove a bestfree entry from the table.
306 */
307void
308xfs_dir2_data_freeremove(
309 xfs_dir2_data_t *d, /* data block pointer */
310 xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */
311 int *loghead) /* out: log data header */
312{
313#ifdef __KERNEL__
314 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
315 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
316#endif
317 /*
318 * It's the first entry, slide the next 2 up.
319 */
320 if (dfp == &d->hdr.bestfree[0]) {
321 d->hdr.bestfree[0] = d->hdr.bestfree[1];
322 d->hdr.bestfree[1] = d->hdr.bestfree[2];
323 }
324 /*
325 * It's the second entry, slide the 3rd entry up.
326 */
327 else if (dfp == &d->hdr.bestfree[1])
328 d->hdr.bestfree[1] = d->hdr.bestfree[2];
329 /*
330 * Must be the last entry.
331 */
332 else
333 ASSERT(dfp == &d->hdr.bestfree[2]);
334 /*
335 * Clear the 3rd entry, must be zero now.
336 */
337 d->hdr.bestfree[2].length = 0;
338 d->hdr.bestfree[2].offset = 0;
339 *loghead = 1;
340}
341
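/*
 * [Editorial sketch, not part of the original file]  Taken together,
 * freeinsert and freeremove above keep bestfree[] sorted by descending
 * length, with any empty slots at the end.  Modeled on plain integers:
 */
#if 0	/* illustrative only; lengths stand in for bestfree entries */
static void model_freeinsert(unsigned short best[3], unsigned short len)
{
	if (len > best[0]) {
		best[2] = best[1];
		best[1] = best[0];
		best[0] = len;
	} else if (len > best[1]) {
		best[2] = best[1];
		best[1] = len;
	} else if (len > best[2])
		best[2] = len;
	/* else: too small for the table, not tracked (returns NULL above) */
}

static void model_freeremove(unsigned short best[3], int i)
{
	for (; i < 2; i++)
		best[i] = best[i + 1];	/* slide later entries up */
	best[2] = 0;			/* third slot is now empty */
}
#endif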
342/*
343 * Given a data block, reconstruct its bestfree map.
344 */
345void
346xfs_dir2_data_freescan(
347 xfs_mount_t *mp, /* filesystem mount point */
348 xfs_dir2_data_t *d, /* data block pointer */
349 int *loghead, /* out: log data header */
350 char *aendp) /* in: caller's endp */
351{
352 xfs_dir2_block_tail_t *btp; /* block tail */
353 xfs_dir2_data_entry_t *dep; /* active data entry */
354 xfs_dir2_data_unused_t *dup; /* unused data entry */
355 char *endp; /* end of block's data */
356 char *p; /* current entry pointer */
357
358#ifdef __KERNEL__
359 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
360 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
361#endif
362 /*
363 * Start by clearing the table.
364 */
365 memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree));
366 *loghead = 1;
367 /*
368 * Set up pointers.
369 */
370 p = (char *)d->u;
371 if (aendp)
372 endp = aendp;
373 else if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
374 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
375 endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
376 } else
377 endp = (char *)d + mp->m_dirblksize;
378 /*
379 * Loop over the block's entries.
380 */
381 while (p < endp) {
382 dup = (xfs_dir2_data_unused_t *)p;
383 /*
384 * If it's a free entry, insert it.
385 */
386 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
387 ASSERT((char *)dup - (char *)d ==
388 INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT));
389 xfs_dir2_data_freeinsert(d, dup, loghead);
390 p += INT_GET(dup->length, ARCH_CONVERT);
391 }
392 /*
393 * For active entries, check their tags and skip them.
394 */
395 else {
396 dep = (xfs_dir2_data_entry_t *)p;
397 ASSERT((char *)dep - (char *)d ==
398 INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT));
399 p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
400 }
401 }
402}
403
404/*
405 * Initialize a data block at the given block number in the directory.
406 * Give back the buffer for the created block.
407 */
408int /* error */
409xfs_dir2_data_init(
410 xfs_da_args_t *args, /* directory operation args */
411 xfs_dir2_db_t blkno, /* logical dir block number */
412 xfs_dabuf_t **bpp) /* output block buffer */
413{
414 xfs_dabuf_t *bp; /* block buffer */
415 xfs_dir2_data_t *d; /* pointer to block */
416 xfs_inode_t *dp; /* incore directory inode */
417 xfs_dir2_data_unused_t *dup; /* unused entry pointer */
418 int error; /* error return value */
419 int i; /* bestfree index */
420 xfs_mount_t *mp; /* filesystem mount point */
421 xfs_trans_t *tp; /* transaction pointer */
422 int t; /* temp */
423
424 dp = args->dp;
425 mp = dp->i_mount;
426 tp = args->trans;
427 /*
428 * Get the buffer set up for the block.
429 */
430 error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp,
431 XFS_DATA_FORK);
432 if (error) {
433 return error;
434 }
435 ASSERT(bp != NULL);
436 /*
437 * Initialize the header.
438 */
439 d = bp->data;
440 INT_SET(d->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
441 INT_SET(d->hdr.bestfree[0].offset, ARCH_CONVERT, (xfs_dir2_data_off_t)sizeof(d->hdr));
442 for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
443 d->hdr.bestfree[i].length = 0;
444 d->hdr.bestfree[i].offset = 0;
445 }
446 /*
447 * Set up an unused entry for the block's body.
448 */
449 dup = &d->u[0].unused;
450 INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
451
452	t = mp->m_dirblksize - (uint)sizeof(d->hdr);
453 INT_SET(d->hdr.bestfree[0].length, ARCH_CONVERT, t);
454 INT_SET(dup->length, ARCH_CONVERT, t);
455 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT,
456 (xfs_dir2_data_off_t)((char *)dup - (char *)d));
457 /*
458 * Log it and return it.
459 */
460 xfs_dir2_data_log_header(tp, bp);
461 xfs_dir2_data_log_unused(tp, bp, dup);
462 *bpp = bp;
463 return 0;
464}
465
466/*
467 * Log an active data entry from the block.
468 */
469void
470xfs_dir2_data_log_entry(
471 xfs_trans_t *tp, /* transaction pointer */
472 xfs_dabuf_t *bp, /* block buffer */
473 xfs_dir2_data_entry_t *dep) /* data entry pointer */
474{
475 xfs_dir2_data_t *d; /* data block pointer */
476
477 d = bp->data;
478 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
479 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
480 xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d),
481 (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) -
482 (char *)d - 1));
483}
484
485/*
486 * Log a data block header.
487 */
488void
489xfs_dir2_data_log_header(
490 xfs_trans_t *tp, /* transaction pointer */
491 xfs_dabuf_t *bp) /* block buffer */
492{
493 xfs_dir2_data_t *d; /* data block pointer */
494
495 d = bp->data;
496 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
497 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
498 xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d),
499 (uint)(sizeof(d->hdr) - 1));
500}
501
502/*
503 * Log a data unused entry.
504 */
505void
506xfs_dir2_data_log_unused(
507 xfs_trans_t *tp, /* transaction pointer */
508 xfs_dabuf_t *bp, /* block buffer */
509 xfs_dir2_data_unused_t *dup) /* data unused pointer */
510{
511 xfs_dir2_data_t *d; /* data block pointer */
512
513 d = bp->data;
514 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
515 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
516 /*
517 * Log the first part of the unused entry.
518 */
519 xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d),
520 (uint)((char *)&dup->length + sizeof(dup->length) -
521 1 - (char *)d));
522 /*
523 * Log the end (tag) of the unused entry.
524 */
525 xfs_da_log_buf(tp, bp,
526 (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d),
527 (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d +
528 sizeof(xfs_dir2_data_off_t) - 1));
529}
530
531/*
532 * Make a byte range in the data block unused.
533 * Its current contents are unimportant.
534 */
535void
536xfs_dir2_data_make_free(
537 xfs_trans_t *tp, /* transaction pointer */
538 xfs_dabuf_t *bp, /* block buffer */
539 xfs_dir2_data_aoff_t offset, /* starting byte offset */
540 xfs_dir2_data_aoff_t len, /* length in bytes */
541 int *needlogp, /* out: log header */
542 int *needscanp) /* out: regen bestfree */
543{
544 xfs_dir2_data_t *d; /* data block pointer */
545 xfs_dir2_data_free_t *dfp; /* bestfree pointer */
546 char *endptr; /* end of data area */
547 xfs_mount_t *mp; /* filesystem mount point */
548 int needscan; /* need to regen bestfree */
549 xfs_dir2_data_unused_t *newdup; /* new unused entry */
550 xfs_dir2_data_unused_t *postdup; /* unused entry after us */
551 xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
552
553 mp = tp->t_mountp;
554 d = bp->data;
555 /*
556 * Figure out where the end of the data area is.
557 */
558 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC)
559 endptr = (char *)d + mp->m_dirblksize;
560 else {
561 xfs_dir2_block_tail_t *btp; /* block tail */
562
563 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
564 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
565 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
566 }
567 /*
568 * If this isn't the start of the block, then back up to
569 * the previous entry and see if it's free.
570 */
571 if (offset > sizeof(d->hdr)) {
572 xfs_dir2_data_off_t *tagp; /* tag just before us */
573
574 tagp = (xfs_dir2_data_off_t *)((char *)d + offset) - 1;
575 prevdup = (xfs_dir2_data_unused_t *)((char *)d + INT_GET(*tagp, ARCH_CONVERT));
576 if (INT_GET(prevdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
577 prevdup = NULL;
578 } else
579 prevdup = NULL;
580 /*
581 * If this isn't the end of the block, see if the entry after
582 * us is free.
583 */
584 if ((char *)d + offset + len < endptr) {
585 postdup =
586 (xfs_dir2_data_unused_t *)((char *)d + offset + len);
587 if (INT_GET(postdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
588 postdup = NULL;
589 } else
590 postdup = NULL;
591 ASSERT(*needscanp == 0);
592 needscan = 0;
593 /*
594 * Previous and following entries are both free,
595 * merge everything into a single free entry.
596 */
597 if (prevdup && postdup) {
598 xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
599
600 /*
601 * See if prevdup and/or postdup are in bestfree table.
602 */
603 dfp = xfs_dir2_data_freefind(d, prevdup);
604 dfp2 = xfs_dir2_data_freefind(d, postdup);
605 /*
606		 * We need a rescan unless there are exactly 2 free entries,
607		 * namely our two. In that case we know exactly what's in
608		 * the table; otherwise the third bestfree slot is in use,
609		 * so there may be other free regions we aren't tracking.
610 */
611 needscan = d->hdr.bestfree[2].length;
612 /*
613 * Fix up the new big freespace.
614 */
615 INT_MOD(prevdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
616 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT,
617 (xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
618 xfs_dir2_data_log_unused(tp, bp, prevdup);
619 if (!needscan) {
620 /*
621 * Has to be the case that entries 0 and 1 are
622 * dfp and dfp2 (don't know which is which), and
623 * entry 2 is empty.
624			 * Remove entry 1 first, then entry 0.
625 */
626 ASSERT(dfp && dfp2);
627 if (dfp == &d->hdr.bestfree[1]) {
628 dfp = &d->hdr.bestfree[0];
629 ASSERT(dfp2 == dfp);
630 dfp2 = &d->hdr.bestfree[1];
631 }
632 xfs_dir2_data_freeremove(d, dfp2, needlogp);
633 xfs_dir2_data_freeremove(d, dfp, needlogp);
634 /*
635 * Now insert the new entry.
636 */
637 dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp);
638 ASSERT(dfp == &d->hdr.bestfree[0]);
639 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(prevdup->length, ARCH_CONVERT));
640 ASSERT(!dfp[1].length);
641 ASSERT(!dfp[2].length);
642 }
643 }
644 /*
645 * The entry before us is free, merge with it.
646 */
647 else if (prevdup) {
648 dfp = xfs_dir2_data_freefind(d, prevdup);
649 INT_MOD(prevdup->length, ARCH_CONVERT, len);
650 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT,
651 (xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
652 xfs_dir2_data_log_unused(tp, bp, prevdup);
653 /*
654 * If the previous entry was in the table, the new entry
655 * is longer, so it will be in the table too. Remove
656 * the old one and add the new one.
657 */
658 if (dfp) {
659 xfs_dir2_data_freeremove(d, dfp, needlogp);
660 (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp);
661 }
662 /*
663 * Otherwise we need a scan if the new entry is big enough.
664 */
665 else
666 needscan = INT_GET(prevdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
667 }
668 /*
669 * The following entry is free, merge with it.
670 */
671 else if (postdup) {
672 dfp = xfs_dir2_data_freefind(d, postdup);
673 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
674 INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
675 INT_SET(newdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
676 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
677 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
678 xfs_dir2_data_log_unused(tp, bp, newdup);
679 /*
680 * If the following entry was in the table, the new entry
681 * is longer, so it will be in the table too. Remove
682 * the old one and add the new one.
683 */
684 if (dfp) {
685 xfs_dir2_data_freeremove(d, dfp, needlogp);
686 (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
687 }
688 /*
689 * Otherwise we need a scan if the new entry is big enough.
690 */
691 else
692 needscan = INT_GET(newdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
693 }
694 /*
695 * Neither neighbor is free. Make a new entry.
696 */
697 else {
698 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
699 INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
700 INT_SET(newdup->length, ARCH_CONVERT, len);
701 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
702 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
703 xfs_dir2_data_log_unused(tp, bp, newdup);
704 (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
705 }
706 *needscanp = needscan;
707}
708
709/*
710 * Take a byte range out of an existing unused space and make it un-free.
711 */
712void
713xfs_dir2_data_use_free(
714 xfs_trans_t *tp, /* transaction pointer */
715 xfs_dabuf_t *bp, /* data block buffer */
716 xfs_dir2_data_unused_t *dup, /* unused entry */
717 xfs_dir2_data_aoff_t offset, /* starting offset to use */
718 xfs_dir2_data_aoff_t len, /* length to use */
719 int *needlogp, /* out: need to log header */
720 int *needscanp) /* out: need regen bestfree */
721{
722 xfs_dir2_data_t *d; /* data block */
723 xfs_dir2_data_free_t *dfp; /* bestfree pointer */
724 int matchback; /* matches end of freespace */
725 int matchfront; /* matches start of freespace */
726 int needscan; /* need to regen bestfree */
727 xfs_dir2_data_unused_t *newdup; /* new unused entry */
728 xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
729 int oldlen; /* old unused entry's length */
730
731 d = bp->data;
732 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
733 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
734 ASSERT(INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG);
735 ASSERT(offset >= (char *)dup - (char *)d);
736 ASSERT(offset + len <= (char *)dup + INT_GET(dup->length, ARCH_CONVERT) - (char *)d);
737 ASSERT((char *)dup - (char *)d == INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT));
738 /*
739 * Look up the entry in the bestfree table.
740 */
741 dfp = xfs_dir2_data_freefind(d, dup);
742 oldlen = INT_GET(dup->length, ARCH_CONVERT);
743 ASSERT(dfp || oldlen <= INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT));
744 /*
745 * Check for alignment with front and back of the entry.
746 */
747 matchfront = (char *)dup - (char *)d == offset;
748 matchback = (char *)dup + oldlen - (char *)d == offset + len;
749 ASSERT(*needscanp == 0);
750 needscan = 0;
751 /*
752 * If we matched it exactly we just need to get rid of it from
753 * the bestfree table.
754 */
755 if (matchfront && matchback) {
756 if (dfp) {
757 needscan = d->hdr.bestfree[2].offset;
758 if (!needscan)
759 xfs_dir2_data_freeremove(d, dfp, needlogp);
760 }
761 }
762 /*
763 * We match the first part of the entry.
764 * Make a new entry with the remaining freespace.
765 */
766 else if (matchfront) {
767 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
768 INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
769 INT_SET(newdup->length, ARCH_CONVERT, oldlen - len);
770 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
771 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
772 xfs_dir2_data_log_unused(tp, bp, newdup);
773 /*
774 * If it was in the table, remove it and add the new one.
775 */
776 if (dfp) {
777 xfs_dir2_data_freeremove(d, dfp, needlogp);
778 dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
779 ASSERT(dfp != NULL);
780 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
781 ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
782 /*
783 * If we got inserted at the last slot,
784			 * we can't tell whether a better candidate
785			 * existed for that slot, so rescan.
786 */
787 needscan = dfp == &d->hdr.bestfree[2];
788 }
789 }
790 /*
791 * We match the last part of the entry.
792 * Trim the allocated space off the tail of the entry.
793 */
794 else if (matchback) {
795 newdup = dup;
796 INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
797 (((char *)d + offset) - (char *)newdup));
798 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
799 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
800 xfs_dir2_data_log_unused(tp, bp, newdup);
801 /*
802 * If it was in the table, remove it and add the new one.
803 */
804 if (dfp) {
805 xfs_dir2_data_freeremove(d, dfp, needlogp);
806 dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
807 ASSERT(dfp != NULL);
808 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
809 ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
810 /*
811 * If we got inserted at the last slot,
812			 * we can't tell whether a better candidate
813			 * existed for that slot, so rescan.
814 */
815 needscan = dfp == &d->hdr.bestfree[2];
816 }
817 }
818 /*
819 * Poking out the middle of an entry.
820 * Make two new entries.
821 */
822 else {
823 newdup = dup;
824 INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
825 (((char *)d + offset) - (char *)newdup));
826 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
827 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
828 xfs_dir2_data_log_unused(tp, bp, newdup);
829 newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
830 INT_SET(newdup2->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
831 INT_SET(newdup2->length, ARCH_CONVERT, oldlen - len - INT_GET(newdup->length, ARCH_CONVERT));
832 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup2), ARCH_CONVERT,
833 (xfs_dir2_data_off_t)((char *)newdup2 - (char *)d));
834 xfs_dir2_data_log_unused(tp, bp, newdup2);
835 /*
836		 * If the old entry was in the table, we need a rescan
837		 * whenever the 3rd slot was valid, since both new
838		 * entries are smaller than the old one.
839		 * If we don't need to scan, that means there were 1 or 2
840		 * entries in the table, and removing the old and adding
841		 * the 2 new will work.
842 */
843 if (dfp) {
844 needscan = d->hdr.bestfree[2].length;
845 if (!needscan) {
846 xfs_dir2_data_freeremove(d, dfp, needlogp);
847 (void)xfs_dir2_data_freeinsert(d, newdup,
848 needlogp);
849 (void)xfs_dir2_data_freeinsert(d, newdup2,
850 needlogp);
851 }
852 }
853 }
854 *needscanp = needscan;
855}
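The use_free routine above reduces to a four-way case split on how the carved-out byte range lines up with the free region that contains it: exact match (the free entry disappears), front match (the remainder keeps the tail), back match (the remainder keeps the front), or a middle cut (two remainders, hence newdup and newdup2). A standalone sketch of just that case analysis, on plain offsets rather than the on-disk structures; the names here are illustrative, not from the tree:

#include <stdio.h>

/* A free region [fo, fo+flen) loses the sub-range [o, o+len). */
static void use_free(unsigned fo, unsigned flen, unsigned o, unsigned len)
{
	int matchfront = (o == fo);
	int matchback = (o + len == fo + flen);

	if (matchfront && matchback)
		printf("exact: free region fully consumed\n");
	else if (matchfront)
		printf("front: remainder [%u,%u)\n", o + len, fo + flen);
	else if (matchback)
		printf("back: remainder [%u,%u)\n", fo, o);
	else
		printf("middle: remainders [%u,%u) and [%u,%u)\n",
		       fo, o, o + len, fo + flen);
}

int main(void)
{
	use_free(16, 64, 16, 64);	/* exact */
	use_free(16, 64, 16, 24);	/* front */
	use_free(16, 64, 56, 24);	/* back */
	use_free(16, 64, 32, 16);	/* middle */
	return 0;
}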
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
new file mode 100644
index 000000000000..3f02294ccff0
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.h
@@ -0,0 +1,231 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_DATA_H__
33#define __XFS_DIR2_DATA_H__
34
35/*
36 * Directory format 2, data block structures.
37 */
38
39struct xfs_dabuf;
40struct xfs_da_args;
41struct xfs_inode;
42struct xfs_trans;
43
44/*
45 * Constants.
46 */
47#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */
48#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
49#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
50#define XFS_DIR2_DATA_FREE_TAG 0xffff
51#define XFS_DIR2_DATA_FD_COUNT 3
52
53/*
54 * Directory address space divided into sections,
55 * spaces separated by 32GB.
56 */
57#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
58#define XFS_DIR2_DATA_SPACE 0
59#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
60#define XFS_DIR2_DATA_FIRSTDB(mp) \
61 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET)
62
63/*
64 * Offsets of . and .. in data space (always block 0)
65 */
66#define XFS_DIR2_DATA_DOT_OFFSET \
67 ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t))
68#define XFS_DIR2_DATA_DOTDOT_OFFSET \
69 (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1))
70#define XFS_DIR2_DATA_FIRST_OFFSET \
71 (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2))
72
73/*
74 * Structures.
75 */
76
77/*
78 * Describe a free area in the data block.
79 * The freespace will be formatted as a xfs_dir2_data_unused_t.
80 */
81typedef struct xfs_dir2_data_free {
82 xfs_dir2_data_off_t offset; /* start of freespace */
83 xfs_dir2_data_off_t length; /* length of freespace */
84} xfs_dir2_data_free_t;
85
86/*
87 * Header for the data blocks.
88 * Always at the beginning of a directory-sized block.
89 * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
90 */
91typedef struct xfs_dir2_data_hdr {
92 __uint32_t magic; /* XFS_DIR2_DATA_MAGIC */
93 /* or XFS_DIR2_BLOCK_MAGIC */
94 xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
95} xfs_dir2_data_hdr_t;
96
97/*
98 * Active entry in a data block. Aligned to 8 bytes.
99 * Tag appears as the last 2 bytes.
100 */
101typedef struct xfs_dir2_data_entry {
102 xfs_ino_t inumber; /* inode number */
103 __uint8_t namelen; /* name length */
104 __uint8_t name[1]; /* name bytes, no null */
105 /* variable offset */
106 xfs_dir2_data_off_t tag; /* starting offset of us */
107} xfs_dir2_data_entry_t;
108
109/*
110 * Unused entry in a data block. Aligned to 8 bytes.
111 * Tag appears as the last 2 bytes.
112 */
113typedef struct xfs_dir2_data_unused {
114 __uint16_t freetag; /* XFS_DIR2_DATA_FREE_TAG */
115 xfs_dir2_data_off_t length; /* total free length */
116 /* variable offset */
117 xfs_dir2_data_off_t tag; /* starting offset of us */
118} xfs_dir2_data_unused_t;
119
120typedef union {
121 xfs_dir2_data_entry_t entry;
122 xfs_dir2_data_unused_t unused;
123} xfs_dir2_data_union_t;
124
125/*
126 * Generic data block structure, for xfs_db.
127 */
128typedef struct xfs_dir2_data {
129 xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */
130 xfs_dir2_data_union_t u[1];
131} xfs_dir2_data_t;
132
133/*
134 * Macros.
135 */
136
137/*
138 * Size of a data entry.
139 */
140#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTSIZE)
141int xfs_dir2_data_entsize(int n);
142#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n)
143#else
144#define XFS_DIR2_DATA_ENTSIZE(n) \
145 ((int)(roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \
146 (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN)))
147#endif
148
149/*
150 * Pointer to an entry's tag word.
151 */
152#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
153xfs_dir2_data_off_t *xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep);
154#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep)
155#else
156#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) \
157 ((xfs_dir2_data_off_t *)\
158 ((char *)(dep) + XFS_DIR2_DATA_ENTSIZE((dep)->namelen) - \
159 (uint)sizeof(xfs_dir2_data_off_t)))
160#endif
161
162/*
163 * Pointer to a freespace's tag word.
164 */
165#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
166xfs_dir2_data_off_t *xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup);
167#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \
168 xfs_dir2_data_unused_tag_p(dup)
169#else
170#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \
171 ((xfs_dir2_data_off_t *)\
172 ((char *)(dup) + INT_GET((dup)->length, ARCH_CONVERT) \
173 - (uint)sizeof(xfs_dir2_data_off_t)))
174#endif
175
176/*
177 * Function declarations.
178 */
179
180#ifdef DEBUG
181extern void
182 xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp);
183#else
184#define xfs_dir2_data_check(dp,bp)
185#endif
186
187extern xfs_dir2_data_free_t *
188 xfs_dir2_data_freefind(xfs_dir2_data_t *d,
189 xfs_dir2_data_unused_t *dup);
190
191extern xfs_dir2_data_free_t *
192 xfs_dir2_data_freeinsert(xfs_dir2_data_t *d,
193 xfs_dir2_data_unused_t *dup, int *loghead);
194
195extern void
196 xfs_dir2_data_freeremove(xfs_dir2_data_t *d,
197 xfs_dir2_data_free_t *dfp, int *loghead);
198
199extern void
200 xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d,
201 int *loghead, char *aendp);
202
203extern int
204 xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
205 struct xfs_dabuf **bpp);
206
207extern void
208 xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
209 xfs_dir2_data_entry_t *dep);
210
211extern void
212 xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
213
214extern void
215 xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp,
216 xfs_dir2_data_unused_t *dup);
217
218extern void
219 xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
220 xfs_dir2_data_aoff_t offset,
221 xfs_dir2_data_aoff_t len, int *needlogp,
222 int *needscanp);
223
224extern void
225 xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
226 xfs_dir2_data_unused_t *dup,
227 xfs_dir2_data_aoff_t offset,
228 xfs_dir2_data_aoff_t len, int *needlogp,
229 int *needscanp);
230
231#endif /* __XFS_DIR2_DATA_H__ */
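Working the entry-size and offset macros above through by hand: a data entry is the 8-byte inumber, a 1-byte namelen, the name bytes, and a 2-byte tag, all rounded up to 8-byte alignment, so XFS_DIR2_DATA_ENTSIZE(n) comes out to roundup(11 + n, 8). The data header is 4 (magic) + 3 * 4 (bestfree) = 16 bytes, which makes the DOT, DOTDOT, and FIRST offsets 16, 32, and 48. (Likewise XFS_DIR2_SPACE_SIZE is 1 << 35 bytes, i.e. 32GB.) A small editorial sketch confirming the arithmetic with local stand-ins, not the kernel macros:

#include <assert.h>

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

/* ENTSIZE(n): 8 (inumber) + 1 (namelen) + n (name) + 2 (tag), 8-aligned */
static int entsize(int namelen)
{
	return ROUNDUP(8 + 1 + namelen + 2, 8);
}

int main(void)
{
	int hdr = 4 + 3 * 4;	/* magic + three 4-byte bestfree entries */

	assert(entsize(1) == 16);			/* "."  */
	assert(entsize(2) == 16);			/* ".." */
	assert(hdr == 16);				/* DOT offset */
	assert(hdr + entsize(1) == 32);			/* DOTDOT offset */
	assert(hdr + entsize(1) + entsize(2) == 48);	/* FIRST offset */
	return 0;
}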
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000000..262d1e86df30
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -0,0 +1,1896 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_leaf.c
35 * XFS directory version 2 implementation - single leaf form
36 * see xfs_dir2_leaf.h for data structures.
37 * These directories have multiple XFS_DIR2_DATA blocks and one
38 * XFS_DIR2_LEAF1 block containing the hash table and freespace map.
39 */
40
41#include "xfs.h"
42
43#include "xfs_macros.h"
44#include "xfs_types.h"
45#include "xfs_inum.h"
46#include "xfs_log.h"
47#include "xfs_trans.h"
48#include "xfs_sb.h"
49#include "xfs_ag.h"
50#include "xfs_dir.h"
51#include "xfs_dir2.h"
52#include "xfs_dmapi.h"
53#include "xfs_mount.h"
54#include "xfs_bmap_btree.h"
55#include "xfs_attr_sf.h"
56#include "xfs_dir_sf.h"
57#include "xfs_dir2_sf.h"
58#include "xfs_dinode.h"
59#include "xfs_inode.h"
60#include "xfs_bmap.h"
61#include "xfs_da_btree.h"
62#include "xfs_dir2_data.h"
63#include "xfs_dir2_leaf.h"
64#include "xfs_dir2_block.h"
65#include "xfs_dir2_node.h"
66#include "xfs_dir2_trace.h"
67#include "xfs_error.h"
68#include "xfs_bit.h"
69
70/*
71 * Local function declarations.
72 */
73#ifdef DEBUG
74static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
75#else
76#define xfs_dir2_leaf_check(dp, bp)
77#endif
78static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp,
79 int *indexp, xfs_dabuf_t **dbpp);
80
81/*
82 * Convert a block form directory to a leaf form directory.
83 */
84int /* error */
85xfs_dir2_block_to_leaf(
86 xfs_da_args_t *args, /* operation arguments */
87 xfs_dabuf_t *dbp) /* input block's buffer */
88{
89 xfs_dir2_data_off_t *bestsp; /* leaf's bestsp entries */
90 xfs_dablk_t blkno; /* leaf block's bno */
91 xfs_dir2_block_t *block; /* block structure */
92 xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */
93 xfs_dir2_block_tail_t *btp; /* block's tail */
94 xfs_inode_t *dp; /* incore directory inode */
95 int error; /* error return code */
96 xfs_dabuf_t *lbp; /* leaf block's buffer */
97 xfs_dir2_db_t ldb; /* leaf block's bno */
98 xfs_dir2_leaf_t *leaf; /* leaf structure */
99 xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
100 xfs_mount_t *mp; /* filesystem mount point */
101 int needlog; /* need to log block header */
102 int needscan; /* need to rescan bestfree */
103 xfs_trans_t *tp; /* transaction pointer */
104
105 xfs_dir2_trace_args_b("block_to_leaf", args, dbp);
106 dp = args->dp;
107 mp = dp->i_mount;
108 tp = args->trans;
109 /*
110 * Add the leaf block to the inode.
111 * This interface will only put blocks in the leaf/node range.
112 * Since that's empty now, we'll get the root (block 0 in range).
113 */
114 if ((error = xfs_da_grow_inode(args, &blkno))) {
115 return error;
116 }
117 ldb = XFS_DIR2_DA_TO_DB(mp, blkno);
118 ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
119 /*
120 * Initialize the leaf block, get a buffer for it.
121 */
122 if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
123 return error;
124 }
125 ASSERT(lbp != NULL);
126 leaf = lbp->data;
127 block = dbp->data;
128 xfs_dir2_data_check(dp, dbp);
129 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
130 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
131 /*
132 * Set the counts in the leaf header.
133 */
134 INT_COPY(leaf->hdr.count, btp->count, ARCH_CONVERT); /* INT_: type change */
135 INT_COPY(leaf->hdr.stale, btp->stale, ARCH_CONVERT); /* INT_: type change */
136 /*
137 * Could compact these but I think we always do the conversion
138 * after squeezing out stale entries.
139 */
140 memcpy(leaf->ents, blp, INT_GET(btp->count, ARCH_CONVERT) * sizeof(xfs_dir2_leaf_entry_t));
141 xfs_dir2_leaf_log_ents(tp, lbp, 0, INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1);
142 needscan = 0;
143 needlog = 1;
144 /*
145 * Make the space formerly occupied by the leaf entries and block
146 * tail be free.
147 */
148 xfs_dir2_data_make_free(tp, dbp,
149 (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
150 (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize -
151 (char *)blp),
152 &needlog, &needscan);
153 /*
154 * Fix up the block header, make it a data block.
155 */
156 INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
157 if (needscan)
158 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
159 NULL);
160 /*
161 * Set up leaf tail and bests table.
162 */
163 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
164 INT_SET(ltp->bestcount, ARCH_CONVERT, 1);
165 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
166 INT_COPY(bestsp[0], block->hdr.bestfree[0].length, ARCH_CONVERT);
167 /*
168 * Log the data header and leaf bests table.
169 */
170 if (needlog)
171 xfs_dir2_data_log_header(tp, dbp);
172 xfs_dir2_leaf_check(dp, lbp);
173 xfs_dir2_data_check(dp, dbp);
174 xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
175 xfs_da_buf_done(lbp);
176 return 0;
177}
178
179/*
180 * Add an entry to a leaf form directory.
181 */
182int /* error */
183xfs_dir2_leaf_addname(
184 xfs_da_args_t *args) /* operation arguments */
185{
186 xfs_dir2_data_off_t *bestsp; /* freespace table in leaf */
187 int compact; /* need to compact leaves */
188 xfs_dir2_data_t *data; /* data block structure */
189 xfs_dabuf_t *dbp; /* data block buffer */
190 xfs_dir2_data_entry_t *dep; /* data block entry */
191 xfs_inode_t *dp; /* incore directory inode */
192 xfs_dir2_data_unused_t *dup; /* data unused entry */
193 int error; /* error return value */
194 int grown; /* allocated new data block */
195 int highstale; /* index of next stale leaf */
196 int i; /* temporary, index */
197 int index; /* leaf table position */
198 xfs_dabuf_t *lbp; /* leaf's buffer */
199 xfs_dir2_leaf_t *leaf; /* leaf structure */
200 int length; /* length of new entry */
201 xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
202 int lfloglow; /* low leaf logging index */
203 int lfloghigh; /* high leaf logging index */
204 int lowstale; /* index of prev stale leaf */
205 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
206 xfs_mount_t *mp; /* filesystem mount point */
207 int needbytes; /* leaf block bytes needed */
208 int needlog; /* need to log data header */
209 int needscan; /* need to rescan data free */
210 xfs_dir2_data_off_t *tagp; /* end of data entry */
211 xfs_trans_t *tp; /* transaction pointer */
212 xfs_dir2_db_t use_block; /* data block number */
213
214 xfs_dir2_trace_args("leaf_addname", args);
215 dp = args->dp;
216 tp = args->trans;
217 mp = dp->i_mount;
218 /*
219 * Read the leaf block.
220 */
221 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
222 XFS_DATA_FORK);
223 if (error) {
224 return error;
225 }
226 ASSERT(lbp != NULL);
227 /*
228 * Look up the entry by hash value and name.
229	 * We know it's not there; our caller has already done a lookup.
230 * So the index is of the entry to insert in front of.
231 * But if there are dup hash values the index is of the first of those.
232 */
233 index = xfs_dir2_leaf_search_hash(args, lbp);
234 leaf = lbp->data;
235 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
236 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
237 length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
238 /*
239 * See if there are any entries with the same hash value
240 * and space in their block for the new entry.
241 * This is good because it puts multiple same-hash value entries
242 * in a data block, improving the lookup of those entries.
243 */
244 for (use_block = -1, lep = &leaf->ents[index];
245 index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
246 index++, lep++) {
247 if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
248 continue;
249 i = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
250 ASSERT(i < INT_GET(ltp->bestcount, ARCH_CONVERT));
251 ASSERT(INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF);
252 if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
253 use_block = i;
254 break;
255 }
256 }
257 /*
258	 * Didn't find a block yet, so linearly search all the data blocks.
259 */
260 if (use_block == -1) {
261 for (i = 0; i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++) {
262 /*
263 * Remember a block we see that's missing.
264 */
265 if (INT_GET(bestsp[i], ARCH_CONVERT) == NULLDATAOFF && use_block == -1)
266 use_block = i;
267 else if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
268 use_block = i;
269 break;
270 }
271 }
272 }
273 /*
274 * How many bytes do we need in the leaf block?
275 */
276 needbytes =
277 (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) +
278 (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0]));
279 /*
280 * Now kill use_block if it refers to a missing block, so we
281 * can use it as an indication of allocation needed.
282 */
283 if (use_block != -1 && INT_GET(bestsp[use_block], ARCH_CONVERT) == NULLDATAOFF)
284 use_block = -1;
285 /*
286 * If we don't have enough free bytes but we can make enough
287 * by compacting out stale entries, we'll do that.
288 */
289 if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] < needbytes &&
290 INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1) {
291 compact = 1;
292 }
293 /*
294 * Otherwise if we don't have enough free bytes we need to
295 * convert to node form.
296 */
297 else if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <
298 needbytes) {
299 /*
300 * Just checking or no space reservation, give up.
301 */
302 if (args->justcheck || args->total == 0) {
303 xfs_da_brelse(tp, lbp);
304 return XFS_ERROR(ENOSPC);
305 }
306 /*
307 * Convert to node form.
308 */
309 error = xfs_dir2_leaf_to_node(args, lbp);
310 xfs_da_buf_done(lbp);
311 if (error)
312 return error;
313 /*
314 * Then add the new entry.
315 */
316 return xfs_dir2_node_addname(args);
317 }
318 /*
319 * Otherwise it will fit without compaction.
320 */
321 else
322 compact = 0;
323 /*
324 * If just checking, then it will fit unless we needed to allocate
325 * a new data block.
326 */
327 if (args->justcheck) {
328 xfs_da_brelse(tp, lbp);
329 return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
330 }
331 /*
332 * If no allocations are allowed, return now before we've
333 * changed anything.
334 */
335 if (args->total == 0 && use_block == -1) {
336 xfs_da_brelse(tp, lbp);
337 return XFS_ERROR(ENOSPC);
338 }
339 /*
340 * Need to compact the leaf entries, removing stale ones.
341 * Leave one stale entry behind - the one closest to our
342 * insertion index - and we'll shift that one to our insertion
343 * point later.
344 */
345 if (compact) {
346 xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
347 &lfloglow, &lfloghigh);
348 }
349 /*
350	 * There are stale entries; start log-low and log-high at
351	 * impossibly bad sentinel values, to be fixed up below.
352 */
353 else if (INT_GET(leaf->hdr.stale, ARCH_CONVERT)) {
354 lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
355 lfloghigh = -1;
356 }
357 /*
358 * If there was no data block space found, we need to allocate
359 * a new one.
360 */
361 if (use_block == -1) {
362 /*
363 * Add the new data block.
364 */
365 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
366 &use_block))) {
367 xfs_da_brelse(tp, lbp);
368 return error;
369 }
370 /*
371 * Initialize the block.
372 */
373 if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
374 xfs_da_brelse(tp, lbp);
375 return error;
376 }
377 /*
378 * If we're adding a new data block on the end we need to
379 * extend the bests table. Copy it up one entry.
380 */
381 if (use_block >= INT_GET(ltp->bestcount, ARCH_CONVERT)) {
382 bestsp--;
383 memmove(&bestsp[0], &bestsp[1],
384 INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(bestsp[0]));
385 INT_MOD(ltp->bestcount, ARCH_CONVERT, +1);
386 xfs_dir2_leaf_log_tail(tp, lbp);
387 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
388 }
389 /*
390 * If we're filling in a previously empty block just log it.
391 */
392 else
393 xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
394 data = dbp->data;
395 INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
396 grown = 1;
397 }
398 /*
399 * Already had space in some data block.
400 * Just read that one in.
401 */
402 else {
403 if ((error =
404 xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block),
405 -1, &dbp, XFS_DATA_FORK))) {
406 xfs_da_brelse(tp, lbp);
407 return error;
408 }
409 data = dbp->data;
410 grown = 0;
411 }
412 xfs_dir2_data_check(dp, dbp);
413 /*
414 * Point to the biggest freespace in our data block.
415 */
416 dup = (xfs_dir2_data_unused_t *)
417 ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
418 ASSERT(INT_GET(dup->length, ARCH_CONVERT) >= length);
419 needscan = needlog = 0;
420 /*
421 * Mark the initial part of our freespace in use for the new entry.
422 */
423 xfs_dir2_data_use_free(tp, dbp, dup,
424 (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
425 &needlog, &needscan);
426 /*
427 * Initialize our new entry (at last).
428 */
429 dep = (xfs_dir2_data_entry_t *)dup;
430 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
431 dep->namelen = args->namelen;
432 memcpy(dep->name, args->name, dep->namelen);
433 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
434 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
435 /*
436	 * Rescan to fix up the bestfree table, if needed.
437 */
438 if (needscan)
439 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
440 /*
441 * Need to log the data block's header.
442 */
443 if (needlog)
444 xfs_dir2_data_log_header(tp, dbp);
445 xfs_dir2_data_log_entry(tp, dbp, dep);
446 /*
447 * If the bests table needs to be changed, do it.
448 * Log the change unless we've already done that.
449 */
450 if (INT_GET(bestsp[use_block], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
451 INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
452 if (!grown)
453 xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
454 }
455 /*
456 * Now we need to make room to insert the leaf entry.
457 * If there are no stale entries, we just insert a hole at index.
458 */
459 if (!leaf->hdr.stale) {
460 /*
461 * lep is still good as the index leaf entry.
462 */
463 if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
464 memmove(lep + 1, lep,
465 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
466 /*
467 * Record low and high logging indices for the leaf.
468 */
469 lfloglow = index;
470 lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
471 INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
472 }
473 /*
474 * There are stale entries.
475 * We will use one of them for the new entry.
476 * It's probably not at the right location, so we'll have to
477 * shift some up or down first.
478 */
479 else {
480 /*
481 * If we didn't compact before, we need to find the nearest
482 * stale entries before and after our insertion point.
483 */
484 if (compact == 0) {
485 /*
486 * Find the first stale entry before the insertion
487 * point, if any.
488 */
489 for (lowstale = index - 1;
490 lowstale >= 0 &&
491 INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
492 XFS_DIR2_NULL_DATAPTR;
493 lowstale--)
494 continue;
495 /*
496 * Find the next stale entry at or after the insertion
497 * point, if any. Stop if we go so far that the
498 * lowstale entry would be better.
499 */
500 for (highstale = index;
501 highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
502 INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
503 XFS_DIR2_NULL_DATAPTR &&
504 (lowstale < 0 ||
505 index - lowstale - 1 >= highstale - index);
506 highstale++)
507 continue;
508 }
509 /*
510 * If the low one is better, use it.
511 */
512 if (lowstale >= 0 &&
513 (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
514 index - lowstale - 1 < highstale - index)) {
515 ASSERT(index - lowstale - 1 >= 0);
516 ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
517 XFS_DIR2_NULL_DATAPTR);
518 /*
519 * Copy entries up to cover the stale entry
520 * and make room for the new entry.
521 */
522 if (index - lowstale - 1 > 0)
523 memmove(&leaf->ents[lowstale],
524 &leaf->ents[lowstale + 1],
525 (index - lowstale - 1) * sizeof(*lep));
526 lep = &leaf->ents[index - 1];
527 lfloglow = MIN(lowstale, lfloglow);
528 lfloghigh = MAX(index - 1, lfloghigh);
529 }
530 /*
531 * The high one is better, so use that one.
532 */
533 else {
534 ASSERT(highstale - index >= 0);
535 ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
536 XFS_DIR2_NULL_DATAPTR);
537 /*
538			 * Copy entries down to cover the stale entry
539 * and make room for the new entry.
540 */
541 if (highstale - index > 0)
542 memmove(&leaf->ents[index + 1],
543 &leaf->ents[index],
544 (highstale - index) * sizeof(*lep));
545 lep = &leaf->ents[index];
546 lfloglow = MIN(index, lfloglow);
547 lfloghigh = MAX(highstale, lfloghigh);
548 }
549 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
550 }
551 /*
552 * Fill in the new leaf entry.
553 */
554 INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
555 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, INT_GET(*tagp, ARCH_CONVERT)));
556 /*
557 * Log the leaf fields and give up the buffers.
558 */
559 xfs_dir2_leaf_log_header(tp, lbp);
560 xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
561 xfs_dir2_leaf_check(dp, lbp);
562 xfs_da_buf_done(lbp);
563 xfs_dir2_data_check(dp, dbp);
564 xfs_da_buf_done(dbp);
565 return 0;
566}
567
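/*
 * Editorial sketch, not part of the original source: the stale-slot
 * selection rule from xfs_dir2_leaf_addname above, reduced to plain C
 * over an int array in which -1 marks a stale slot.  It assumes, as
 * the caller above guarantees, that at least one stale slot exists.
 * All names here are hypothetical.
 */
static int				/* index of slot to reuse */
pick_stale_slot(
	int		*ents,		/* entries, -1 == stale */
	int		count,		/* number of entries */
	int		index)		/* desired insertion index */
{
	int		lowstale;	/* nearest stale slot below index */
	int		highstale;	/* nearest stale slot at/after index */

	/*
	 * Walk down to the nearest stale slot before the insertion point.
	 */
	for (lowstale = index - 1;
	     lowstale >= 0 && ents[lowstale] != -1;
	     lowstale--)
		continue;
	/*
	 * Walk up to the nearest stale slot at or after the insertion
	 * point, stopping early once lowstale is already closer.
	 */
	for (highstale = index;
	     highstale < count && ents[highstale] != -1 &&
	     (lowstale < 0 || index - lowstale - 1 >= highstale - index);
	     highstale++)
		continue;
	/*
	 * Reusing lowstale shifts index - lowstale - 1 entries up;
	 * reusing highstale shifts highstale - index entries down.
	 * Pick whichever memmove is smaller.
	 */
	if (lowstale >= 0 &&
	    (highstale == count || index - lowstale - 1 < highstale - index))
		return lowstale;
	return highstale;
}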
568#ifdef DEBUG
569/*
570 * Check the internal consistency of a leaf1 block.
571 * Pop an assert if something is wrong.
572 */
573void
574xfs_dir2_leaf_check(
575 xfs_inode_t *dp, /* incore directory inode */
576 xfs_dabuf_t *bp) /* leaf's buffer */
577{
578 int i; /* leaf index */
579 xfs_dir2_leaf_t *leaf; /* leaf structure */
580 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
581 xfs_mount_t *mp; /* filesystem mount point */
582 int stale; /* count of stale leaves */
583
584 leaf = bp->data;
585 mp = dp->i_mount;
586 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
587 /*
588 * This value is not restrictive enough.
589 * Should factor in the size of the bests table as well.
590 * We can deduce a value for that from di_size.
591 */
592 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
593 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
594 /*
595 * Leaves and bests don't overlap.
596 */
597 ASSERT((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <=
598 (char *)XFS_DIR2_LEAF_BESTS_P(ltp));
599 /*
600 * Check hash value order, count stale entries.
601 */
602 for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
603 if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT))
604 ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
605 INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
606 if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
607 stale++;
608 }
609 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
610}
611#endif /* DEBUG */
612
613/*
614 * Compact out any stale entries in the leaf.
615 * Log the header and changed leaf entries, if any.
616 */
617void
618xfs_dir2_leaf_compact(
619 xfs_da_args_t *args, /* operation arguments */
620 xfs_dabuf_t *bp) /* leaf buffer */
621{
622 int from; /* source leaf index */
623 xfs_dir2_leaf_t *leaf; /* leaf structure */
624 int loglow; /* first leaf entry to log */
625 int to; /* target leaf index */
626
627 leaf = bp->data;
628 if (!leaf->hdr.stale) {
629 return;
630 }
631 /*
632 * Compress out the stale entries in place.
633 */
634 for (from = to = 0, loglow = -1; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
635 if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
636 continue;
637 /*
638 * Only actually copy the entries that are different.
639 */
640 if (from > to) {
641 if (loglow == -1)
642 loglow = to;
643 leaf->ents[to] = leaf->ents[from];
644 }
645 to++;
646 }
647 /*
648 * Update and log the header, log the leaf entries.
649 */
650 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == from - to);
651 INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(INT_GET(leaf->hdr.stale, ARCH_CONVERT)));
652 leaf->hdr.stale = 0;
653 xfs_dir2_leaf_log_header(args->trans, bp);
654 if (loglow != -1)
655 xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
656}
657
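/*
 * Editorial sketch, not part of the original source: the two-index
 * in-place compaction idiom used by xfs_dir2_leaf_compact below, on a
 * plain int array where -1 marks a stale slot.  loglow captures the
 * first slot that actually moved, which is what bounds the range the
 * real code logs.  Names are hypothetical.
 */
static int				/* new entry count */
compact_stale(
	int		*ents,		/* entries, -1 == stale */
	int		count,		/* current entry count */
	int		*loglow)	/* out: first modified slot, or -1 */
{
	int		from;		/* source index */
	int		to;		/* destination index */

	for (from = to = 0, *loglow = -1; from < count; from++) {
		if (ents[from] == -1)
			continue;
		/*
		 * Only copy entries that actually move.
		 */
		if (from > to) {
			if (*loglow == -1)
				*loglow = to;
			ents[to] = ents[from];
		}
		to++;
	}
	return to;
}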
658/*
659 * Compact the leaf entries, removing stale ones.
660 * Leave one stale entry behind - the one closest to our insertion
661 * index - and the caller will shift that one to our insertion point
662 * later.
663 * Return the new insertion index, the position of the remaining
664 * stale entry, and the leaf logging indices.
665 */
666void
667xfs_dir2_leaf_compact_x1(
668 xfs_dabuf_t *bp, /* leaf buffer */
669 int *indexp, /* insertion index */
670 int *lowstalep, /* out: stale entry before us */
671 int *highstalep, /* out: stale entry after us */
672 int *lowlogp, /* out: low log index */
673 int *highlogp) /* out: high log index */
674{
675 int from; /* source copy index */
676 int highstale; /* stale entry at/after index */
677 int index; /* insertion index */
678 int keepstale; /* source index of kept stale */
679 xfs_dir2_leaf_t *leaf; /* leaf structure */
680 int lowstale; /* stale entry before index */
681 int newindex=0; /* new insertion index */
682 int to; /* destination copy index */
683
684 leaf = bp->data;
685 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1);
686 index = *indexp;
687 /*
688 * Find the first stale entry before our index, if any.
689 */
690 for (lowstale = index - 1;
691 lowstale >= 0 &&
692 INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
693 lowstale--)
694 continue;
695 /*
696 * Find the first stale entry at or after our index, if any.
697 * Stop if the answer would be worse than lowstale.
698 */
699 for (highstale = index;
700 highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
701 INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
702 (lowstale < 0 || index - lowstale > highstale - index);
703 highstale++)
704 continue;
705 /*
706 * Pick the better of lowstale and highstale.
707 */
708 if (lowstale >= 0 &&
709 (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
710 index - lowstale <= highstale - index))
711 keepstale = lowstale;
712 else
713 keepstale = highstale;
714 /*
715 * Copy the entries in place, removing all the stale entries
716 * except keepstale.
717 */
718 for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
719 /*
720 * Notice the new value of index.
721 */
722 if (index == from)
723 newindex = to;
724 if (from != keepstale &&
725 INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
726 if (from == to)
727 *lowlogp = to;
728 continue;
729 }
730 /*
731 * Record the new keepstale value for the insertion.
732 */
733 if (from == keepstale)
734 lowstale = highstale = to;
735 /*
736 * Copy only the entries that have moved.
737 */
738 if (from > to)
739 leaf->ents[to] = leaf->ents[from];
740 to++;
741 }
742 ASSERT(from > to);
743 /*
744 * If the insertion point was past the last entry,
745 * set the new insertion point accordingly.
746 */
747 if (index == from)
748 newindex = to;
749 *indexp = newindex;
750 /*
751 * Adjust the leaf header values.
752 */
753 INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(from - to));
754 INT_SET(leaf->hdr.stale, ARCH_CONVERT, 1);
755 /*
756 * Remember the low/high stale value only in the "right"
757 * direction.
758 */
759 if (lowstale >= newindex)
760 lowstale = -1;
761 else
762 highstale = INT_GET(leaf->hdr.count, ARCH_CONVERT);
763 *highlogp = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
764 *lowstalep = lowstale;
765 *highstalep = highstale;
766}
767
768/*
769 * Getdents (readdir) for leaf and node directories.
770 * This reads the data blocks only, so is the same for both forms.
771 */
772int /* error */
773xfs_dir2_leaf_getdents(
774 xfs_trans_t *tp, /* transaction pointer */
775 xfs_inode_t *dp, /* incore directory inode */
776 uio_t *uio, /* I/O control & vectors */
777 int *eofp, /* out: reached end of dir */
778 xfs_dirent_t *dbp, /* caller's buffer */
779 xfs_dir2_put_t put) /* ABI formatting routine */
780{
781 xfs_dabuf_t *bp; /* data block buffer */
782 int byteoff; /* offset in current block */
783 xfs_dir2_db_t curdb; /* db for current block */
784 xfs_dir2_off_t curoff; /* current overall offset */
785 xfs_dir2_data_t *data; /* data block structure */
786 xfs_dir2_data_entry_t *dep; /* data entry */
787 xfs_dir2_data_unused_t *dup; /* unused entry */
788 int eof; /* reached end of directory */
789 int error=0; /* error return value */
790 int i; /* temporary loop index */
791 int j; /* temporary loop index */
792 int length; /* temporary length value */
793 xfs_bmbt_irec_t *map; /* map vector for blocks */
794 xfs_extlen_t map_blocks; /* number of fsbs in map */
795 xfs_dablk_t map_off; /* last mapped file offset */
796 int map_size; /* total entries in *map */
797 int map_valid; /* valid entries in *map */
798 xfs_mount_t *mp; /* filesystem mount point */
799 xfs_dir2_off_t newoff; /* new curoff after new blk */
800 int nmap; /* mappings to ask xfs_bmapi */
801 xfs_dir2_put_args_t p; /* formatting arg bundle */
802 char *ptr=NULL; /* pointer to current data */
803 int ra_current; /* number of read-ahead blks */
804 int ra_index; /* *map index for read-ahead */
805 int ra_offset; /* map entry offset for ra */
806 int ra_want; /* readahead count wanted */
807
808 /*
809 * If the offset is at or past the largest allowed value,
810 * give up right away, return eof.
811 */
812 if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) {
813 *eofp = 1;
814 return 0;
815 }
816 mp = dp->i_mount;
817 /*
818 * Setup formatting arguments.
819 */
820 p.dbp = dbp;
821 p.put = put;
822 p.uio = uio;
823 /*
824 * Set up to bmap a number of blocks based on the caller's
825 * buffer size, the directory block size, and the filesystem
826 * block size.
827 */
828 map_size =
829 howmany(uio->uio_resid + mp->m_dirblksize,
830 mp->m_sb.sb_blocksize);
831 map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
832 map_valid = ra_index = ra_offset = ra_current = map_blocks = 0;
833 bp = NULL;
834 eof = 1;
835 /*
836 * Inside the loop we keep the main offset value as a byte offset
837 * in the directory file.
838 */
839 curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset);
840 /*
841 * Force this conversion through db so we truncate the offset
842 * down to get the start of the data block.
843 */
844 map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff));
845 /*
846 * Loop over directory entries until we reach the end offset.
847 * Get more blocks and readahead as necessary.
848 */
849 while (curoff < XFS_DIR2_LEAF_OFFSET) {
850 /*
851 * If we have no buffer, or we're off the end of the
852 * current buffer, need to get another one.
853 */
854 if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) {
855 /*
856 * If we have a buffer, we need to release it and
857 * take it out of the mapping.
858 */
859 if (bp) {
860 xfs_da_brelse(tp, bp);
861 bp = NULL;
862 map_blocks -= mp->m_dirblkfsbs;
863 /*
864 * Loop to get rid of the extents for the
865 * directory block.
866 */
867 for (i = mp->m_dirblkfsbs; i > 0; ) {
868 j = MIN((int)map->br_blockcount, i);
869 map->br_blockcount -= j;
870 map->br_startblock += j;
871 map->br_startoff += j;
872 /*
873 * If mapping is done, pitch it from
874 * the table.
875 */
876 if (!map->br_blockcount && --map_valid)
877 memmove(&map[0], &map[1],
878 sizeof(map[0]) *
879 map_valid);
880 i -= j;
881 }
882 }
883 /*
884 * Recalculate the readahead blocks wanted.
885 */
886 ra_want = howmany(uio->uio_resid + mp->m_dirblksize,
887 mp->m_sb.sb_blocksize) - 1;
888 /*
889 * If we don't have as many as we want, and we haven't
890 * run out of data blocks, get some more mappings.
891 */
892 if (1 + ra_want > map_blocks &&
893 map_off <
894 XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) {
895 /*
896 * Get more bmaps, fill in after the ones
897 * we already have in the table.
898 */
899 nmap = map_size - map_valid;
900 error = xfs_bmapi(tp, dp,
901 map_off,
902 XFS_DIR2_BYTE_TO_DA(mp,
903 XFS_DIR2_LEAF_OFFSET) - map_off,
904 XFS_BMAPI_METADATA, NULL, 0,
905 &map[map_valid], &nmap, NULL);
906 /*
907 * Don't know if we should ignore this or
908 * try to return an error.
909 * The trouble with returning errors
910 * is that readdir will just stop without
911 * actually passing the error through.
912 */
913 if (error)
914 break; /* XXX */
915 /*
916 * If we got all the mappings we asked for,
917 * set the final map offset based on the
918 * last bmap value received.
919 * Otherwise, we've reached the end.
920 */
921 if (nmap == map_size - map_valid)
922 map_off =
923 map[map_valid + nmap - 1].br_startoff +
924 map[map_valid + nmap - 1].br_blockcount;
925 else
926 map_off =
927 XFS_DIR2_BYTE_TO_DA(mp,
928 XFS_DIR2_LEAF_OFFSET);
929 /*
930 * Look for holes in the mapping, and
931 * eliminate them. Count up the valid blocks.
932 */
933 for (i = map_valid; i < map_valid + nmap; ) {
934 if (map[i].br_startblock ==
935 HOLESTARTBLOCK) {
936 nmap--;
937 length = map_valid + nmap - i;
938 if (length)
939 memmove(&map[i],
940 &map[i + 1],
941 sizeof(map[i]) *
942 length);
943 } else {
944 map_blocks +=
945 map[i].br_blockcount;
946 i++;
947 }
948 }
949 map_valid += nmap;
950 }
951 /*
952 * No valid mappings, so no more data blocks.
953 */
954 if (!map_valid) {
955 curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off);
956 break;
957 }
958 /*
959 * Read the directory block starting at the first
960 * mapping.
961 */
962 curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff);
963 error = xfs_da_read_buf(tp, dp, map->br_startoff,
964 map->br_blockcount >= mp->m_dirblkfsbs ?
965 XFS_FSB_TO_DADDR(mp, map->br_startblock) :
966 -1,
967 &bp, XFS_DATA_FORK);
968 /*
969 * Should just skip over the data block instead
970 * of giving up.
971 */
972 if (error)
973 break; /* XXX */
974 /*
975 * Adjust the current amount of read-ahead: we just
976			 * read a block that was previously read ahead.
977 */
978 if (ra_current)
979 ra_current -= mp->m_dirblkfsbs;
980 /*
981 * Do we need more readahead?
982 */
983 for (ra_index = ra_offset = i = 0;
984 ra_want > ra_current && i < map_blocks;
985 i += mp->m_dirblkfsbs) {
986 ASSERT(ra_index < map_valid);
987 /*
988 * Read-ahead a contiguous directory block.
989 */
990 if (i > ra_current &&
991 map[ra_index].br_blockcount >=
992 mp->m_dirblkfsbs) {
993 xfs_baread(mp->m_ddev_targp,
994 XFS_FSB_TO_DADDR(mp,
995 map[ra_index].br_startblock +
996 ra_offset),
997 (int)BTOBB(mp->m_dirblksize));
998 ra_current = i;
999 }
1000 /*
1001 * Read-ahead a non-contiguous directory block.
1002 * This doesn't use our mapping, but this
1003 * is a very rare case.
1004 */
1005 else if (i > ra_current) {
1006 (void)xfs_da_reada_buf(tp, dp,
1007 map[ra_index].br_startoff +
1008 ra_offset, XFS_DATA_FORK);
1009 ra_current = i;
1010 }
1011 /*
1012 * Advance offset through the mapping table.
1013 */
1014				for (j = 0; j < mp->m_dirblkfsbs; j += length) {
1015					/*
1016					 * Advance by the rest of this extent,
1017					 * but no more than the rest of this
1018					 * directory block.
1019					 */
1020					length = MIN((int)(mp->m_dirblkfsbs - j),
1021						(int)(map[ra_index].br_blockcount -
1022						ra_offset));
1023					ra_offset += length;
1024 /*
1025 * Advance to the next mapping if
1026 * this one is used up.
1027 */
1028 if (ra_offset ==
1029 map[ra_index].br_blockcount) {
1030 ra_offset = 0;
1031 ra_index++;
1032 }
1033 }
1034 }
1035 /*
1036 * Having done a read, we need to set a new offset.
1037 */
1038 newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0);
1039 /*
1040 * Start of the current block.
1041 */
1042 if (curoff < newoff)
1043 curoff = newoff;
1044 /*
1045 * Make sure we're in the right block.
1046 */
1047 else if (curoff > newoff)
1048 ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) ==
1049 curdb);
1050 data = bp->data;
1051 xfs_dir2_data_check(dp, bp);
1052 /*
1053 * Find our position in the block.
1054 */
1055 ptr = (char *)&data->u;
1056 byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff);
1057 /*
1058 * Skip past the header.
1059 */
1060 if (byteoff == 0)
1061 curoff += (uint)sizeof(data->hdr);
1062 /*
1063 * Skip past entries until we reach our offset.
1064 */
1065 else {
1066 while ((char *)ptr - (char *)data < byteoff) {
1067 dup = (xfs_dir2_data_unused_t *)ptr;
1068
1069 if (INT_GET(dup->freetag, ARCH_CONVERT)
1070 == XFS_DIR2_DATA_FREE_TAG) {
1071
1072 length = INT_GET(dup->length,
1073 ARCH_CONVERT);
1074 ptr += length;
1075 continue;
1076 }
1077 dep = (xfs_dir2_data_entry_t *)ptr;
1078 length =
1079 XFS_DIR2_DATA_ENTSIZE(dep->namelen);
1080 ptr += length;
1081 }
1082 /*
1083 * Now set our real offset.
1084 */
1085 curoff =
1086 XFS_DIR2_DB_OFF_TO_BYTE(mp,
1087 XFS_DIR2_BYTE_TO_DB(mp, curoff),
1088 (char *)ptr - (char *)data);
1089 if (ptr >= (char *)data + mp->m_dirblksize) {
1090 continue;
1091 }
1092 }
1093 }
1094 /*
1095 * We have a pointer to an entry.
1096 * Is it a live one?
1097 */
1098 dup = (xfs_dir2_data_unused_t *)ptr;
1099 /*
1100 * No, it's unused, skip over it.
1101 */
1102 if (INT_GET(dup->freetag, ARCH_CONVERT)
1103 == XFS_DIR2_DATA_FREE_TAG) {
1104 length = INT_GET(dup->length, ARCH_CONVERT);
1105 ptr += length;
1106 curoff += length;
1107 continue;
1108 }
1109
1110 /*
1111 * Copy the entry into the putargs, and try formatting it.
1112 */
1113 dep = (xfs_dir2_data_entry_t *)ptr;
1114
1115 p.namelen = dep->namelen;
1116
1117 length = XFS_DIR2_DATA_ENTSIZE(p.namelen);
1118
1119 p.cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length);
1120
1121 p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
1122#if XFS_BIG_INUMS
1123 p.ino += mp->m_inoadd;
1124#endif
1125 p.name = (char *)dep->name;
1126
1127 error = p.put(&p);
1128
1129 /*
1130 * Won't fit. Return to caller.
1131 */
1132 if (!p.done) {
1133 eof = 0;
1134 break;
1135 }
1136 /*
1137 * Advance to next entry in the block.
1138 */
1139 ptr += length;
1140 curoff += length;
1141 }
1142
1143 /*
1144 * All done. Set output offset value to current offset.
1145 */
1146 *eofp = eof;
1147 if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR))
1148 uio->uio_offset = XFS_DIR2_MAX_DATAPTR;
1149 else
1150 uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff);
1151 kmem_free(map, map_size * sizeof(*map));
1152 if (bp)
1153 xfs_da_brelse(tp, bp);
1154 return error;
1155}
1156
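/*
 * Editorial note with illustrative numbers, not from the source: for
 * the bmap-vector sizing in xfs_dir2_leaf_getdents above, a caller
 * buffer of uio_resid == 16384 bytes with m_dirblksize == 4096 and
 * sb_blocksize == 4096 gives map_size = howmany(16384 + 4096, 4096)
 * = 5 mappings (howmany(a, b) rounds a/b up), and the first pass
 * around the loop wants ra_want = 5 - 1 = 4 blocks of readahead.
 */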
1157/*
1158 * Initialize a new leaf block, leaf1 or leafn magic accepted.
1159 */
1160int
1161xfs_dir2_leaf_init(
1162 xfs_da_args_t *args, /* operation arguments */
1163 xfs_dir2_db_t bno, /* directory block number */
1164 xfs_dabuf_t **bpp, /* out: leaf buffer */
1165 int magic) /* magic number for block */
1166{
1167 xfs_dabuf_t *bp; /* leaf buffer */
1168 xfs_inode_t *dp; /* incore directory inode */
1169 int error; /* error return code */
1170 xfs_dir2_leaf_t *leaf; /* leaf structure */
1171 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1172 xfs_mount_t *mp; /* filesystem mount point */
1173 xfs_trans_t *tp; /* transaction pointer */
1174
1175 dp = args->dp;
1176 ASSERT(dp != NULL);
1177 tp = args->trans;
1178 mp = dp->i_mount;
1179 ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
1180 bno < XFS_DIR2_FREE_FIRSTDB(mp));
1181 /*
1182 * Get the buffer for the block.
1183 */
1184 error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp,
1185 XFS_DATA_FORK);
1186 if (error) {
1187 return error;
1188 }
1189 ASSERT(bp != NULL);
1190 leaf = bp->data;
1191 /*
1192 * Initialize the header.
1193 */
1194 INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, magic);
1195 leaf->hdr.info.forw = 0;
1196 leaf->hdr.info.back = 0;
1197 leaf->hdr.count = 0;
1198 leaf->hdr.stale = 0;
1199 xfs_dir2_leaf_log_header(tp, bp);
1200 /*
1201 * If it's a leaf-format directory initialize the tail.
1202 * In this case our caller has the real bests table to copy into
1203 * the block.
1204 */
1205 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1206 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1207 ltp->bestcount = 0;
1208 xfs_dir2_leaf_log_tail(tp, bp);
1209 }
1210 *bpp = bp;
1211 return 0;
1212}
1213
1214/*
1215 * Log the bests entries indicated from a leaf1 block.
1216 */
1217void
1218xfs_dir2_leaf_log_bests(
1219 xfs_trans_t *tp, /* transaction pointer */
1220 xfs_dabuf_t *bp, /* leaf buffer */
1221 int first, /* first entry to log */
1222 int last) /* last entry to log */
1223{
1224 xfs_dir2_data_off_t *firstb; /* pointer to first entry */
1225 xfs_dir2_data_off_t *lastb; /* pointer to last entry */
1226 xfs_dir2_leaf_t *leaf; /* leaf structure */
1227 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1228
1229 leaf = bp->data;
1230 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
1231 ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf);
1232 firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first;
1233 lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last;
1234 xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
1235 (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
1236}
1237
1238/*
1239 * Log the leaf entries indicated from a leaf1 or leafn block.
1240 */
1241void
1242xfs_dir2_leaf_log_ents(
1243 xfs_trans_t *tp, /* transaction pointer */
1244 xfs_dabuf_t *bp, /* leaf buffer */
1245 int first, /* first entry to log */
1246 int last) /* last entry to log */
1247{
1248 xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
1249 xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
1250 xfs_dir2_leaf_t *leaf; /* leaf structure */
1251
1252 leaf = bp->data;
1253 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
1254 INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1255 firstlep = &leaf->ents[first];
1256 lastlep = &leaf->ents[last];
1257 xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
1258 (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
1259}
1260
1261/*
1262 * Log the header of the leaf1 or leafn block.
1263 */
1264void
1265xfs_dir2_leaf_log_header(
1266 xfs_trans_t *tp, /* transaction pointer */
1267 xfs_dabuf_t *bp) /* leaf buffer */
1268{
1269 xfs_dir2_leaf_t *leaf; /* leaf structure */
1270
1271 leaf = bp->data;
1272 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
1273 INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1274 xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
1275 (uint)(sizeof(leaf->hdr) - 1));
1276}
1277
1278/*
1279 * Log the tail of the leaf1 block.
1280 */
1281void
1282xfs_dir2_leaf_log_tail(
1283 xfs_trans_t *tp, /* transaction pointer */
1284 xfs_dabuf_t *bp) /* leaf buffer */
1285{
1286 xfs_dir2_leaf_t *leaf; /* leaf structure */
1287 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1288 xfs_mount_t *mp; /* filesystem mount point */
1289
1290 mp = tp->t_mountp;
1291 leaf = bp->data;
1292 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
1293 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1294 xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
1295 (uint)(mp->m_dirblksize - 1));
1296}
1297
1298/*
1299 * Look up the entry referred to by args in the leaf format directory.
1300 * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
1301 * is also used by the node-format code.
1302 */
1303int
1304xfs_dir2_leaf_lookup(
1305 xfs_da_args_t *args) /* operation arguments */
1306{
1307 xfs_dabuf_t *dbp; /* data block buffer */
1308 xfs_dir2_data_entry_t *dep; /* data block entry */
1309 xfs_inode_t *dp; /* incore directory inode */
1310 int error; /* error return code */
1311 int index; /* found entry index */
1312 xfs_dabuf_t *lbp; /* leaf buffer */
1313 xfs_dir2_leaf_t *leaf; /* leaf structure */
1314 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1315 xfs_trans_t *tp; /* transaction pointer */
1316
1317 xfs_dir2_trace_args("leaf_lookup", args);
1318 /*
1319 * Look up name in the leaf block, returning both buffers and index.
1320 */
1321 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1322 return error;
1323 }
1324 tp = args->trans;
1325 dp = args->dp;
1326 xfs_dir2_leaf_check(dp, lbp);
1327 leaf = lbp->data;
1328 /*
1329 * Get to the leaf entry and contained data entry address.
1330 */
1331 lep = &leaf->ents[index];
1332 /*
1333 * Point to the data entry.
1334 */
1335 dep = (xfs_dir2_data_entry_t *)
1336 ((char *)dbp->data +
1337 XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
1338 /*
1339 * Return the found inode number.
1340 */
1341 args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
1342 xfs_da_brelse(tp, dbp);
1343 xfs_da_brelse(tp, lbp);
1344 return XFS_ERROR(EEXIST);
1345}
1346
1347/*
1348 * Look up name/hash in the leaf block.
1349 * Fill in indexp with the found index, and dbpp with the data buffer.
1350 * If not found dbpp will be NULL, and ENOENT comes back.
1351 * lbpp will always be filled in with the leaf buffer unless there's an error.
1352 */
1353static int /* error */
1354xfs_dir2_leaf_lookup_int(
1355 xfs_da_args_t *args, /* operation arguments */
1356 xfs_dabuf_t **lbpp, /* out: leaf buffer */
1357 int *indexp, /* out: index in leaf block */
1358 xfs_dabuf_t **dbpp) /* out: data buffer */
1359{
1360 xfs_dir2_db_t curdb; /* current data block number */
1361 xfs_dabuf_t *dbp; /* data buffer */
1362 xfs_dir2_data_entry_t *dep; /* data entry */
1363 xfs_inode_t *dp; /* incore directory inode */
1364 int error; /* error return code */
1365 int index; /* index in leaf block */
1366 xfs_dabuf_t *lbp; /* leaf buffer */
1367 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1368 xfs_dir2_leaf_t *leaf; /* leaf structure */
1369 xfs_mount_t *mp; /* filesystem mount point */
1370 xfs_dir2_db_t newdb; /* new data block number */
1371 xfs_trans_t *tp; /* transaction pointer */
1372
1373 dp = args->dp;
1374 tp = args->trans;
1375 mp = dp->i_mount;
1376 /*
1377 * Read the leaf block into the buffer.
1378 */
1379 if ((error =
1380 xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
1381 XFS_DATA_FORK))) {
1382 return error;
1383 }
1384 *lbpp = lbp;
1385 leaf = lbp->data;
1386 xfs_dir2_leaf_check(dp, lbp);
1387 /*
1388 * Look for the first leaf entry with our hash value.
1389 */
1390 index = xfs_dir2_leaf_search_hash(args, lbp);
1391 /*
1392 * Loop over all the entries with the right hash value
1393 * looking to match the name.
1394 */
1395 for (lep = &leaf->ents[index], dbp = NULL, curdb = -1;
1396 index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
1397 lep++, index++) {
1398 /*
1399 * Skip over stale leaf entries.
1400 */
1401 if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
1402 continue;
1403 /*
1404 * Get the new data block number.
1405 */
1406 newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
1407 /*
1408 * If it's not the same as the old data block number,
1409 * need to pitch the old one and read the new one.
1410 */
1411 if (newdb != curdb) {
1412 if (dbp)
1413 xfs_da_brelse(tp, dbp);
1414 if ((error =
1415 xfs_da_read_buf(tp, dp,
1416 XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp,
1417 XFS_DATA_FORK))) {
1418 xfs_da_brelse(tp, lbp);
1419 return error;
1420 }
1421 xfs_dir2_data_check(dp, dbp);
1422 curdb = newdb;
1423 }
1424 /*
1425 * Point to the data entry.
1426 */
1427 dep = (xfs_dir2_data_entry_t *)
1428 ((char *)dbp->data +
1429 XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
1430 /*
1431 * If it matches then return it.
1432 */
1433 if (dep->namelen == args->namelen &&
1434 dep->name[0] == args->name[0] &&
1435 memcmp(dep->name, args->name, args->namelen) == 0) {
1436 *dbpp = dbp;
1437 *indexp = index;
1438 return 0;
1439 }
1440 }
1441 /*
1442 * No match found, return ENOENT.
1443 */
1444 ASSERT(args->oknoent);
1445 if (dbp)
1446 xfs_da_brelse(tp, dbp);
1447 xfs_da_brelse(tp, lbp);
1448 return XFS_ERROR(ENOENT);
1449}
1450
1451/*
1452 * Remove an entry from a leaf format directory.
1453 */
1454int /* error */
1455xfs_dir2_leaf_removename(
1456 xfs_da_args_t *args) /* operation arguments */
1457{
1458 xfs_dir2_data_off_t *bestsp; /* leaf block best freespace */
1459 xfs_dir2_data_t *data; /* data block structure */
1460 xfs_dir2_db_t db; /* data block number */
1461 xfs_dabuf_t *dbp; /* data block buffer */
1462 xfs_dir2_data_entry_t *dep; /* data entry structure */
1463 xfs_inode_t *dp; /* incore directory inode */
1464 int error; /* error return code */
1465 xfs_dir2_db_t i; /* temporary data block # */
1466 int index; /* index into leaf entries */
1467 xfs_dabuf_t *lbp; /* leaf buffer */
1468 xfs_dir2_leaf_t *leaf; /* leaf structure */
1469 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1470 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1471 xfs_mount_t *mp; /* filesystem mount point */
1472 int needlog; /* need to log data header */
1473 int needscan; /* need to rescan data frees */
1474 xfs_dir2_data_off_t oldbest; /* old value of best free */
1475 xfs_trans_t *tp; /* transaction pointer */
1476
1477 xfs_dir2_trace_args("leaf_removename", args);
1478 /*
1479 * Lookup the leaf entry, get the leaf and data blocks read in.
1480 */
1481 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1482 return error;
1483 }
1484 dp = args->dp;
1485 tp = args->trans;
1486 mp = dp->i_mount;
1487 leaf = lbp->data;
1488 data = dbp->data;
1489 xfs_dir2_data_check(dp, dbp);
1490 /*
1491 * Point to the leaf entry, use that to point to the data entry.
1492 */
1493 lep = &leaf->ents[index];
1494 db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
1495 dep = (xfs_dir2_data_entry_t *)
1496 ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
1497 needscan = needlog = 0;
1498 oldbest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
1499 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1500 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
1501 ASSERT(INT_GET(bestsp[db], ARCH_CONVERT) == oldbest);
1502 /*
1503 * Mark the former data entry unused.
1504 */
1505 xfs_dir2_data_make_free(tp, dbp,
1506 (xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
1507 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
1508 /*
1509 * We just mark the leaf entry stale by putting a null in it.
1510 */
1511 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
1512 xfs_dir2_leaf_log_header(tp, lbp);
1513 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
1514 xfs_dir2_leaf_log_ents(tp, lbp, index, index);
1515 /*
1516 * Scan the freespace in the data block again if necessary,
1517 * log the data block header if necessary.
1518 */
1519 if (needscan)
1520 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
1521 if (needlog)
1522 xfs_dir2_data_log_header(tp, dbp);
1523 /*
1524 * If the longest freespace in the data block has changed,
1525 * put the new value in the bests table and log that.
1526 */
1527 if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) != oldbest) {
1528 INT_COPY(bestsp[db], data->hdr.bestfree[0].length, ARCH_CONVERT);
1529 xfs_dir2_leaf_log_bests(tp, lbp, db, db);
1530 }
1531 xfs_dir2_data_check(dp, dbp);
1532 /*
1533 * If the data block is now empty then get rid of the data block.
1534 */
1535 if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
1536 mp->m_dirblksize - (uint)sizeof(data->hdr)) {
1537 ASSERT(db != mp->m_dirdatablk);
1538 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1539 /*
1540			 * Nope, we can't get rid of it: doing so would have
1541			 * required allocating a bmap btree block.
1542 * Just go on, returning success, leaving the
1543 * empty block in place.
1544 */
1545 if (error == ENOSPC && args->total == 0) {
1546 xfs_da_buf_done(dbp);
1547 error = 0;
1548 }
1549 xfs_dir2_leaf_check(dp, lbp);
1550 xfs_da_buf_done(lbp);
1551 return error;
1552 }
1553 dbp = NULL;
1554 /*
1555 * If this is the last data block then compact the
1556 * bests table by getting rid of entries.
1557 */
1558 if (db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1) {
1559 /*
1560 * Look for the last active entry (i).
1561 */
1562 for (i = db - 1; i > 0; i--) {
1563 if (INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF)
1564 break;
1565 }
1566 /*
1567 * Copy the table down so inactive entries at the
1568 * end are removed.
1569 */
1570 memmove(&bestsp[db - i], bestsp,
1571 (INT_GET(ltp->bestcount, ARCH_CONVERT) - (db - i)) * sizeof(*bestsp));
1572 INT_MOD(ltp->bestcount, ARCH_CONVERT, -(db - i));
1573 xfs_dir2_leaf_log_tail(tp, lbp);
1574 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1575 } else
1576 INT_SET(bestsp[db], ARCH_CONVERT, NULLDATAOFF);
1577 }
1578 /*
1579 * If the data block was not the first one, drop it.
1580 */
1581 else if (db != mp->m_dirdatablk && dbp != NULL) {
1582 xfs_da_buf_done(dbp);
1583 dbp = NULL;
1584 }
1585 xfs_dir2_leaf_check(dp, lbp);
1586 /*
1587 * See if we can convert to block form.
1588 */
1589 return xfs_dir2_leaf_to_block(args, lbp, dbp);
1590}
1591
1592/*
1593 * Replace the inode number in a leaf format directory entry.
1594 */
1595int /* error */
1596xfs_dir2_leaf_replace(
1597 xfs_da_args_t *args) /* operation arguments */
1598{
1599 xfs_dabuf_t *dbp; /* data block buffer */
1600 xfs_dir2_data_entry_t *dep; /* data block entry */
1601 xfs_inode_t *dp; /* incore directory inode */
1602 int error; /* error return code */
1603 int index; /* index of leaf entry */
1604 xfs_dabuf_t *lbp; /* leaf buffer */
1605 xfs_dir2_leaf_t *leaf; /* leaf structure */
1606 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1607 xfs_trans_t *tp; /* transaction pointer */
1608
1609 xfs_dir2_trace_args("leaf_replace", args);
1610 /*
1611 * Look up the entry.
1612 */
1613 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1614 return error;
1615 }
1616 dp = args->dp;
1617 leaf = lbp->data;
1618 /*
1619 * Point to the leaf entry, get data address from it.
1620 */
1621 lep = &leaf->ents[index];
1622 /*
1623 * Point to the data entry.
1624 */
1625 dep = (xfs_dir2_data_entry_t *)
1626 ((char *)dbp->data +
1627 XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
1628 ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
1629 /*
1630 * Put the new inode number in, log it.
1631 */
1632 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
1633 tp = args->trans;
1634 xfs_dir2_data_log_entry(tp, dbp, dep);
1635 xfs_da_buf_done(dbp);
1636 xfs_dir2_leaf_check(dp, lbp);
1637 xfs_da_brelse(tp, lbp);
1638 return 0;
1639}
1640
1641/*
1642 * Return index in the leaf block (lbp) which is either the first
1643 * one with this hash value, or if there are none, the insert point
1644 * for that hash value.
1645 */
1646int /* index value */
1647xfs_dir2_leaf_search_hash(
1648 xfs_da_args_t *args, /* operation arguments */
1649 xfs_dabuf_t *lbp) /* leaf buffer */
1650{
1651 xfs_dahash_t hash=0; /* hash from this entry */
1652 xfs_dahash_t hashwant; /* hash value looking for */
1653 int high; /* high leaf index */
1654 int low; /* low leaf index */
1655 xfs_dir2_leaf_t *leaf; /* leaf structure */
1656 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1657 int mid=0; /* current leaf index */
1658
1659 leaf = lbp->data;
1660#ifndef __KERNEL__
1661 if (!leaf->hdr.count)
1662 return 0;
1663#endif
1664 /*
1665 * Note, the table cannot be empty, so we have to go through the loop.
1666 * Binary search the leaf entries looking for our hash value.
1667 */
1668 for (lep = leaf->ents, low = 0, high = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1,
1669 hashwant = args->hashval;
1670 low <= high; ) {
1671 mid = (low + high) >> 1;
1672 if ((hash = INT_GET(lep[mid].hashval, ARCH_CONVERT)) == hashwant)
1673 break;
1674 if (hash < hashwant)
1675 low = mid + 1;
1676 else
1677 high = mid - 1;
1678 }
1679 /*
1680 * Found one, back up through all the equal hash values.
1681 */
1682 if (hash == hashwant) {
1683 while (mid > 0 && INT_GET(lep[mid - 1].hashval, ARCH_CONVERT) == hashwant) {
1684 mid--;
1685 }
1686 }
1687 /*
1688 * Need to point to an entry higher than ours.
1689 */
1690 else if (hash < hashwant)
1691 mid++;
1692 return mid;
1693}
1694
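/*
 * Editorial sketch, not part of the original source: the search in
 * xfs_dir2_leaf_search_hash above as a standalone function over a
 * sorted array of hash values.  Find any match by binary search, back
 * up to the first of a run of equal values, or return the insertion
 * point if there is no match.  Like the original, it assumes
 * count > 0.  Names are hypothetical.
 */
static int				/* index value */
search_first_hash(
	const __uint32_t *hashes,	/* sorted hash values */
	int		count,		/* number of values, > 0 */
	__uint32_t	want)		/* hash value to find */
{
	int		low = 0;	/* low index */
	int		high = count - 1; /* high index */
	int		mid = 0;	/* probe index */
	__uint32_t	hash = 0;	/* probed value */

	while (low <= high) {
		mid = (low + high) >> 1;
		if ((hash = hashes[mid]) == want)
			break;
		if (hash < want)
			low = mid + 1;
		else
			high = mid - 1;
	}
	/*
	 * Found one: back up through all the equal hash values.
	 */
	if (hash == want) {
		while (mid > 0 && hashes[mid - 1] == want)
			mid--;
	}
	/*
	 * Not found: point past the last entry smaller than ours.
	 */
	else if (hash < want)
		mid++;
	return mid;
}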
1695/*
1696 * Trim off a trailing data block. We know it's empty since the leaf
1697 * freespace table says so.
1698 */
1699int /* error */
1700xfs_dir2_leaf_trim_data(
1701 xfs_da_args_t *args, /* operation arguments */
1702 xfs_dabuf_t *lbp, /* leaf buffer */
1703 xfs_dir2_db_t db) /* data block number */
1704{
1705 xfs_dir2_data_off_t *bestsp; /* leaf bests table */
1706#ifdef DEBUG
1707 xfs_dir2_data_t *data; /* data block structure */
1708#endif
1709 xfs_dabuf_t *dbp; /* data block buffer */
1710 xfs_inode_t *dp; /* incore directory inode */
1711 int error; /* error return value */
1712 xfs_dir2_leaf_t *leaf; /* leaf structure */
1713 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1714 xfs_mount_t *mp; /* filesystem mount point */
1715 xfs_trans_t *tp; /* transaction pointer */
1716
1717 dp = args->dp;
1718 mp = dp->i_mount;
1719 tp = args->trans;
1720 /*
1721 * Read the offending data block. We need its buffer.
1722 */
1723 if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp,
1724 XFS_DATA_FORK))) {
1725 return error;
1726 }
1727#ifdef DEBUG
1728 data = dbp->data;
1729 ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
1730#endif
1731	/*
1732	 * This seems to be an error: data is only valid if DEBUG
1733	 * is defined?  (It is referenced only in ASSERTs, which
1734	 * also compile away without DEBUG.)  RMC 09/08/1999
1735	 */
1736 leaf = lbp->data;
1737 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1738 ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
1739 mp->m_dirblksize - (uint)sizeof(data->hdr));
1740 ASSERT(db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1741 /*
1742 * Get rid of the data block.
1743 */
1744 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1745 ASSERT(error != ENOSPC);
1746 xfs_da_brelse(tp, dbp);
1747 return error;
1748 }
1749 /*
1750 * Eliminate the last bests entry from the table.
1751 */
1752 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
1753 INT_MOD(ltp->bestcount, ARCH_CONVERT, -1);
1754 memmove(&bestsp[1], &bestsp[0], INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(*bestsp));
1755 xfs_dir2_leaf_log_tail(tp, lbp);
1756 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1757 return 0;
1758}
1759
1760/*
1761 * Convert node form directory to leaf form directory.
1762 * The root of the node form dir needs to already be a LEAFN block.
1763 * Just return if we can't do anything.
1764 */
1765int /* error */
1766xfs_dir2_node_to_leaf(
1767 xfs_da_state_t *state) /* directory operation state */
1768{
1769 xfs_da_args_t *args; /* operation arguments */
1770 xfs_inode_t *dp; /* incore directory inode */
1771 int error; /* error return code */
1772 xfs_dabuf_t *fbp; /* buffer for freespace block */
1773 xfs_fileoff_t fo; /* freespace file offset */
1774 xfs_dir2_free_t *free; /* freespace structure */
1775 xfs_dabuf_t *lbp; /* buffer for leaf block */
1776 xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
1777 xfs_dir2_leaf_t *leaf; /* leaf structure */
1778 xfs_mount_t *mp; /* filesystem mount point */
1779 int rval; /* successful free trim? */
1780 xfs_trans_t *tp; /* transaction pointer */
1781
1782 /*
1783 * There's more than a leaf level in the btree, so there must
1784 * be multiple leafn blocks. Give up.
1785 */
1786 if (state->path.active > 1)
1787 return 0;
1788 args = state->args;
1789 xfs_dir2_trace_args("node_to_leaf", args);
1790 mp = state->mp;
1791 dp = args->dp;
1792 tp = args->trans;
1793 /*
1794 * Get the last offset in the file.
1795 */
1796 if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
1797 return error;
1798 }
1799 fo -= mp->m_dirblkfsbs;
1800 /*
1801 * If there are freespace blocks other than the first one,
1802 * take this opportunity to remove trailing empty freespace blocks
1803 * that may have been left behind during no-space-reservation
1804 * operations.
1805 */
1806 while (fo > mp->m_dirfreeblk) {
1807 if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
1808 return error;
1809 }
1810 if (rval)
1811 fo -= mp->m_dirblkfsbs;
1812 else
1813 return 0;
1814 }
1815 /*
1816 * Now find the block just before the freespace block.
1817 */
1818 if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
1819 return error;
1820 }
1821 /*
1822 * If it's not the single leaf block, give up.
1823 */
1824 if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
1825 return 0;
1826 lbp = state->path.blk[0].bp;
1827 leaf = lbp->data;
1828 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1829 /*
1830 * Read the freespace block.
1831 */
1832 if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
1833 XFS_DATA_FORK))) {
1834 return error;
1835 }
1836 free = fbp->data;
1837 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1838 ASSERT(!free->hdr.firstdb);
1839 /*
1840 * Now see if the leafn and free data will fit in a leaf1.
1841 * If not, release the buffer and give up.
1842 */
1843 if ((uint)sizeof(leaf->hdr) +
1844 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)) * (uint)sizeof(leaf->ents[0]) +
1845 INT_GET(free->hdr.nvalid, ARCH_CONVERT) * (uint)sizeof(leaf->bests[0]) +
1846 (uint)sizeof(leaf->tail) >
1847 mp->m_dirblksize) {
1848 xfs_da_brelse(tp, fbp);
1849 return 0;
1850 }
1851 /*
1852 * If the leaf has any stale entries in it, compress them out.
1853 * The compact routine will log the header.
1854 */
1855 if (INT_GET(leaf->hdr.stale, ARCH_CONVERT))
1856 xfs_dir2_leaf_compact(args, lbp);
1857 else
1858 xfs_dir2_leaf_log_header(tp, lbp);
1859 INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAF1_MAGIC);
1860 /*
1861 * Set up the leaf tail from the freespace block.
1862 */
1863 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1864 INT_COPY(ltp->bestcount, free->hdr.nvalid, ARCH_CONVERT);
1865 /*
1866 * Set up the leaf bests table.
1867 */
1868 memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests,
1869 INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(leaf->bests[0]));
1870 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1871 xfs_dir2_leaf_log_tail(tp, lbp);
1872 xfs_dir2_leaf_check(dp, lbp);
1873 /*
1874 * Get rid of the freespace block.
1875 */
1876 error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
1877 if (error) {
1878 /*
1879		 * This can't fail with ENOSPC, because that can only happen
1880		 * when punching out the middle of an extent, and this is an
1881		 * isolated block.
1882 */
1883 ASSERT(error != ENOSPC);
1884 return error;
1885 }
1886 fbp = NULL;
1887 /*
1888 * Now see if we can convert the single-leaf directory
1889 * down to a block form directory.
1890 * This routine always kills the dabuf for the leaf, so
1891 * eliminate it from the path.
1892 */
1893 error = xfs_dir2_leaf_to_block(args, lbp, NULL);
1894 state->path.blk[0].bp = NULL;
1895 return error;
1896}
diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h
new file mode 100644
index 000000000000..7f20eee56a52
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.h
@@ -0,0 +1,360 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_LEAF_H__
33#define __XFS_DIR2_LEAF_H__
34
35/*
36 * Directory version 2, leaf block structures.
37 */
38
39struct uio;
40struct xfs_dabuf;
41struct xfs_da_args;
42struct xfs_inode;
43struct xfs_mount;
44struct xfs_trans;
45
46/*
47 * Constants.
48 */
49
50/*
51 * Offset of the leaf/node space. First block in this space
52 * is the btree root.
53 */
54#define XFS_DIR2_LEAF_SPACE 1
55#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
56#define XFS_DIR2_LEAF_FIRSTDB(mp) \
57 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET)
58
59/*
60 * Types.
61 */
62
63/*
64 * Offset in data space of a data entry.
65 */
66typedef __uint32_t xfs_dir2_dataptr_t;
67#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
68#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
69
70/*
71 * Structures.
72 */
73
74/*
75 * Leaf block header.
76 */
77typedef struct xfs_dir2_leaf_hdr {
78 xfs_da_blkinfo_t info; /* header for da routines */
79 __uint16_t count; /* count of entries */
80 __uint16_t stale; /* count of stale entries */
81} xfs_dir2_leaf_hdr_t;
82
83/*
84 * Leaf block entry.
85 */
86typedef struct xfs_dir2_leaf_entry {
87 xfs_dahash_t hashval; /* hash value of name */
88 xfs_dir2_dataptr_t address; /* address of data entry */
89} xfs_dir2_leaf_entry_t;
90
91/*
92 * Leaf block tail.
93 */
94typedef struct xfs_dir2_leaf_tail {
95 __uint32_t bestcount;
96} xfs_dir2_leaf_tail_t;
97
98/*
99 * Leaf block.
100 * bests and tail are at the end of the block for single-leaf only
101 * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC).
102 */
103typedef struct xfs_dir2_leaf {
104 xfs_dir2_leaf_hdr_t hdr; /* leaf header */
105 xfs_dir2_leaf_entry_t ents[1]; /* entries */
106 /* ... */
107 xfs_dir2_data_off_t bests[1]; /* best free counts */
108 xfs_dir2_leaf_tail_t tail; /* leaf tail */
109} xfs_dir2_leaf_t;
110
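/*
 * Editorial illustration, not part of the original source: layout of
 * a leaf1 block.  The ents array grows toward higher offsets from the
 * header, while bests and tail sit at the very end of the block, so
 * the free space between them shrinks from both sides.
 *
 *	+----------------------------------+  offset 0
 *	| hdr: info, count, stale           |
 *	+----------------------------------+
 *	| ents[0] .. ents[count - 1]        |  grows toward higher offsets
 *	+----------------------------------+
 *	| (free space)                      |
 *	+----------------------------------+
 *	| bests[0] .. bests[bestcount - 1]  |  base moves down as it grows
 *	+----------------------------------+
 *	| tail: bestcount                   |  ends at offset m_dirblksize
 *	+----------------------------------+
 */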
111/*
112 * Macros.
113 * The DB blocks are logical directory block numbers, not filesystem blocks.
114 */
115
116#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
117int
118xfs_dir2_max_leaf_ents(struct xfs_mount *mp);
119#define XFS_DIR2_MAX_LEAF_ENTS(mp) \
120 xfs_dir2_max_leaf_ents(mp)
121#else
122#define XFS_DIR2_MAX_LEAF_ENTS(mp) \
123 ((int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / \
124 (uint)sizeof(xfs_dir2_leaf_entry_t)))
125#endif
126
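/*
 * Editorial worked example, assuming a 4096-byte directory block,
 * sizeof(xfs_dir2_leaf_hdr_t) == 16 and sizeof(xfs_dir2_leaf_entry_t)
 * == 8: XFS_DIR2_MAX_LEAF_ENTS = (4096 - 16) / 8 = 510.  As the
 * consistency-check comment in xfs_dir2_leaf.c notes, this bound does
 * not account for the bests table and tail of a leaf1 block.
 */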
127/*
128 * Get address of the bestcount field in the single-leaf block.
129 */
130#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_TAIL_P)
131xfs_dir2_leaf_tail_t *
132xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp);
133#define XFS_DIR2_LEAF_TAIL_P(mp,lp) \
134 xfs_dir2_leaf_tail_p(mp, lp)
135#else
136#define XFS_DIR2_LEAF_TAIL_P(mp,lp) \
137 ((xfs_dir2_leaf_tail_t *)\
138 ((char *)(lp) + (mp)->m_dirblksize - \
139 (uint)sizeof(xfs_dir2_leaf_tail_t)))
140#endif
141
142/*
143 * Get address of the bests array in the single-leaf block.
144 */
145#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_BESTS_P)
146xfs_dir2_data_off_t *
147xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp);
148#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp)
149#else
150#define XFS_DIR2_LEAF_BESTS_P(ltp) \
151 ((xfs_dir2_data_off_t *)(ltp) - INT_GET((ltp)->bestcount, ARCH_CONVERT))
152#endif
153
154/*
155 * Convert dataptr to byte in file space
156 */
157#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
158xfs_dir2_off_t
159xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
160#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp)
161#else
162#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) \
163 ((xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG)
164#endif
165
166/*
167 * Convert byte in file space to dataptr. It had better be aligned.
168 */
169#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
170xfs_dir2_dataptr_t
171xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by);
172#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by)
173#else
174#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) \
175 ((xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG))
176#endif
177
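/*
 * Editorial worked example, assuming XFS_DIR2_DATA_ALIGN_LOG == 3
 * (8-byte alignment): byte offset 0x20158 packs to dataptr
 * 0x20158 >> 3 == 0x402b, and 0x402b << 3 == 0x20158 recovers it.
 * An unaligned byte offset would lose its low bits in the shift,
 * which is why dataptr-addressed entries must be 8-byte aligned.
 */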
178/*
179 * Convert dataptr to a block number
180 */
181#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
182xfs_dir2_db_t
183xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
184#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp)
185#else
186#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) \
187 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
188#endif
189
190/*
191 * Convert dataptr to a byte offset in a block
192 */
193#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
194xfs_dir2_data_aoff_t
195xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
196#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp)
197#else
198#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) \
199 XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
200#endif
201
202/*
203 * Convert block and offset to byte in space
204 */
205#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
206xfs_dir2_off_t
207xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
208 xfs_dir2_data_aoff_t o);
209#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \
210 xfs_dir2_db_off_to_byte(mp, db, o)
211#else
212#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \
213 (((xfs_dir2_off_t)(db) << \
214 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o))
215#endif
216
217/*
218 * Convert byte in space to (DB) block
219 */
220#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DB)
221xfs_dir2_db_t xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by);
222#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by)
223#else
224#define XFS_DIR2_BYTE_TO_DB(mp,by) \
225 ((xfs_dir2_db_t)((by) >> \
226 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)))
227#endif
228
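/*
 * Editorial worked example, assuming sb_blocklog == 12 and
 * sb_dirblklog == 0 (4096-byte directory blocks): byte offset 0x20158
 * splits into db = 0x20158 >> 12 == 0x20, with the in-block offset
 * 0x20158 & 0xfff == 0x158 (see XFS_DIR2_BYTE_TO_OFF below);
 * XFS_DIR2_DB_OFF_TO_BYTE reassembles (0x20 << 12) + 0x158 == 0x20158.
 */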
229/*
230 * Convert byte in space to (DA) block
231 */
232#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DA)
233xfs_dablk_t xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by);
234#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by)
235#else
236#define XFS_DIR2_BYTE_TO_DA(mp,by) \
237 XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by))
238#endif
239
240/*
241 * Convert byte in space to offset in a block
242 */
243#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_OFF)
244xfs_dir2_data_aoff_t
245xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by);
246#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by)
247#else
248#define XFS_DIR2_BYTE_TO_OFF(mp,by) \
249 ((xfs_dir2_data_aoff_t)((by) & \
250 ((1 << ((mp)->m_sb.sb_blocklog + \
251 (mp)->m_sb.sb_dirblklog)) - 1)))
252#endif
253
254/*
255 * Convert block and offset to dataptr
256 */
257#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
258xfs_dir2_dataptr_t
259xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
260 xfs_dir2_data_aoff_t o);
261#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \
262 xfs_dir2_db_off_to_dataptr(mp, db, o)
263#else
264#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \
265 XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o))
266#endif
267
268/*
269 * Convert block (DB) to block (dablk)
270 */
271#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_DA)
272xfs_dablk_t xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db);
273#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db)
274#else
275#define XFS_DIR2_DB_TO_DA(mp,db) \
276 ((xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog))
277#endif
278
279/*
280 * Convert block (dablk) to block (DB)
281 */
282#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_DB)
283xfs_dir2_db_t xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da);
284#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da)
285#else
286#define XFS_DIR2_DA_TO_DB(mp,da) \
287 ((xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog))
288#endif
289
290/*
291 * Convert block (dablk) to byte offset in space
292 */
293#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_BYTE)
294xfs_dir2_off_t xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da);
295#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da)
296#else
297#define XFS_DIR2_DA_TO_BYTE(mp,da) \
298 XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0)
299#endif
300
301/*
302 * Function declarations.
303 */
304
305extern int
306 xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_dabuf *dbp);
307
308extern int
309 xfs_dir2_leaf_addname(struct xfs_da_args *args);
310
311extern void
312 xfs_dir2_leaf_compact(struct xfs_da_args *args, struct xfs_dabuf *bp);
313
314extern void
315 xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp,
316 int *lowstalep, int *highstalep, int *lowlogp,
317 int *highlogp);
318
319extern int
320 xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
321 struct uio *uio, int *eofp, struct xfs_dirent *dbp,
322 xfs_dir2_put_t put);
323
324extern int
325 xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
326 struct xfs_dabuf **bpp, int magic);
327
328extern void
329 xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp,
330 int first, int last);
331
332extern void
333 xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
334 int first, int last);
335
336extern void
337 xfs_dir2_leaf_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
338
339extern void
340 xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
341
342extern int
343 xfs_dir2_leaf_lookup(struct xfs_da_args *args);
344
345extern int
346 xfs_dir2_leaf_removename(struct xfs_da_args *args);
347
348extern int
349 xfs_dir2_leaf_replace(struct xfs_da_args *args);
350
351extern int
352 xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
353 struct xfs_dabuf *lbp);
354extern int
355 xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_dabuf *lbp, xfs_dir2_db_t db);
356
357extern int
358 xfs_dir2_node_to_leaf(struct xfs_da_state *state);
359
360#endif /* __XFS_DIR2_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
new file mode 100644
index 000000000000..a7615d86bfb7
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.c
@@ -0,0 +1,2020 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_node.c
35 * XFS directory implementation, version 2, node form files
36 * See data structures in xfs_dir2_node.h and xfs_da_btree.h.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_da_btree.h"
59#include "xfs_dir2_data.h"
60#include "xfs_dir2_leaf.h"
61#include "xfs_dir2_block.h"
62#include "xfs_dir2_node.h"
63#include "xfs_dir2_trace.h"
64#include "xfs_error.h"
65
66/*
67 * Function declarations.
68 */
69static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp);
70static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index);
71#ifdef DEBUG
72static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
73#else
74#define xfs_dir2_leafn_check(dp, bp)
75#endif
76static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s,
77 int start_s, xfs_dabuf_t *bp_d, int start_d,
78 int count);
79static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
80 xfs_da_state_blk_t *blk1,
81 xfs_da_state_blk_t *blk2);
82static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp,
83 int index, xfs_da_state_blk_t *dblk,
84 int *rval);
85static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
86 xfs_da_state_blk_t *fblk);
87
88/*
89 * Log entries from a freespace block.
90 */
91void
92xfs_dir2_free_log_bests(
93 xfs_trans_t *tp, /* transaction pointer */
94 xfs_dabuf_t *bp, /* freespace buffer */
95 int first, /* first entry to log */
96 int last) /* last entry to log */
97{
98 xfs_dir2_free_t *free; /* freespace structure */
99
100 free = bp->data;
101 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
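	/*
	 * Log the byte range from the start of bests[first] through
	 * the last byte of bests[last], as offsets within the block.
	 */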
102 xfs_da_log_buf(tp, bp,
103 (uint)((char *)&free->bests[first] - (char *)free),
104 (uint)((char *)&free->bests[last] - (char *)free +
105 sizeof(free->bests[0]) - 1));
106}
107
108/*
109 * Log header from a freespace block.
110 */
111static void
112xfs_dir2_free_log_header(
113 xfs_trans_t *tp, /* transaction pointer */
114 xfs_dabuf_t *bp) /* freespace buffer */
115{
116 xfs_dir2_free_t *free; /* freespace structure */
117
118 free = bp->data;
119 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
120 xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
121 (uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
122}
123
124/*
125 * Convert a leaf-format directory to a node-format directory.
126 * We need to change the magic number of the leaf block, and copy
127 * the freespace table out of the leaf block into its own block.
128 */
129int /* error */
130xfs_dir2_leaf_to_node(
131 xfs_da_args_t *args, /* operation arguments */
132 xfs_dabuf_t *lbp) /* leaf buffer */
133{
134 xfs_inode_t *dp; /* incore directory inode */
135 int error; /* error return value */
136 xfs_dabuf_t *fbp; /* freespace buffer */
137 xfs_dir2_db_t fdb; /* freespace block number */
138 xfs_dir2_free_t *free; /* freespace structure */
139 xfs_dir2_data_off_t *from; /* pointer to freespace entry */
140 int i; /* leaf freespace index */
141 xfs_dir2_leaf_t *leaf; /* leaf structure */
142 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
143 xfs_mount_t *mp; /* filesystem mount point */
144 int n; /* count of live freespc ents */
145 xfs_dir2_data_off_t off; /* freespace entry value */
146 xfs_dir2_data_off_t *to; /* pointer to freespace entry */
147 xfs_trans_t *tp; /* transaction pointer */
148
149 xfs_dir2_trace_args_b("leaf_to_node", args, lbp);
150 dp = args->dp;
151 mp = dp->i_mount;
152 tp = args->trans;
153 /*
154 * Add a freespace block to the directory.
155 */
156 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
157 return error;
158 }
159 ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
160 /*
161 * Get the buffer for the new freespace block.
162 */
163 if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp,
164 XFS_DATA_FORK))) {
165 return error;
166 }
167 ASSERT(fbp != NULL);
168 free = fbp->data;
169 leaf = lbp->data;
170 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
171 /*
172 * Initialize the freespace block header.
173 */
174 INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
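	/* Zero is the same in either byte order, so no INT_SET needed. */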
175 free->hdr.firstdb = 0;
176 ASSERT(INT_GET(ltp->bestcount, ARCH_CONVERT) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
177 INT_COPY(free->hdr.nvalid, ltp->bestcount, ARCH_CONVERT);
178 /*
179 * Copy freespace entries from the leaf block to the new block.
180 * Count active entries.
181 */
182 for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests;
183 i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++, from++, to++) {
184 if ((off = INT_GET(*from, ARCH_CONVERT)) != NULLDATAOFF)
185 n++;
186 INT_SET(*to, ARCH_CONVERT, off);
187 }
188 INT_SET(free->hdr.nused, ARCH_CONVERT, n);
189 INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAFN_MAGIC);
190 /*
191 * Log everything.
192 */
193 xfs_dir2_leaf_log_header(tp, lbp);
194 xfs_dir2_free_log_header(tp, fbp);
195 xfs_dir2_free_log_bests(tp, fbp, 0, INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1);
196 xfs_da_buf_done(fbp);
197 xfs_dir2_leafn_check(dp, lbp);
198 return 0;
199}
200
201/*
202 * Add a leaf entry to a leaf block in a node-form directory.
203 * The other work necessary is done by the caller.
204 */
205static int /* error */
206xfs_dir2_leafn_add(
207 xfs_dabuf_t *bp, /* leaf buffer */
208 xfs_da_args_t *args, /* operation arguments */
209 int index) /* insertion pt for new entry */
210{
211 int compact; /* compacting stale leaves */
212 xfs_inode_t *dp; /* incore directory inode */
213 int highstale; /* next stale entry */
214 xfs_dir2_leaf_t *leaf; /* leaf structure */
215 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
216 int lfloghigh; /* high leaf entry logging */
217 int lfloglow; /* low leaf entry logging */
218 int lowstale; /* previous stale entry */
219 xfs_mount_t *mp; /* filesystem mount point */
220 xfs_trans_t *tp; /* transaction pointer */
221
222 xfs_dir2_trace_args_sb("leafn_add", args, index, bp);
223 dp = args->dp;
224 mp = dp->i_mount;
225 tp = args->trans;
226 leaf = bp->data;
227
228 /*
229	 * Quick check just to make sure we are not going to index
230	 * into other people's memory.
231 */
232 if (index < 0)
233 return XFS_ERROR(EFSCORRUPTED);
234
235 /*
236	 * If the block already holds the maximum number of leaf entries
237	 * and none of them are stale, the new entry won't fit and the
238	 * caller will do a split.  If there are stale entries we'll do
239	 * a compact.
240 */
241
242 if (INT_GET(leaf->hdr.count, ARCH_CONVERT) == XFS_DIR2_MAX_LEAF_ENTS(mp)) {
243 if (!leaf->hdr.stale)
244 return XFS_ERROR(ENOSPC);
245 compact = INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1;
246 } else
247 compact = 0;
248 ASSERT(index == 0 || INT_GET(leaf->ents[index - 1].hashval, ARCH_CONVERT) <= args->hashval);
249 ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
250 INT_GET(leaf->ents[index].hashval, ARCH_CONVERT) >= args->hashval);
251
252 if (args->justcheck)
253 return 0;
254
255 /*
256 * Compact out all but one stale leaf entry. Leaves behind
257 * the entry closest to index.
258 */
259 if (compact) {
260 xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
261 &lfloglow, &lfloghigh);
262 }
263 /*
264	 * Set impossible logging indices; the MIN/MAX updates below fix them.
265 */
266 else if (leaf->hdr.stale) {
267 lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
268 lfloghigh = -1;
269 }
270 /*
271 * No stale entries, just insert a space for the new entry.
272 */
273 if (!leaf->hdr.stale) {
274 lep = &leaf->ents[index];
275 if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
276 memmove(lep + 1, lep,
277 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
278 lfloglow = index;
279 lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
280 INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
281 }
282 /*
283 * There are stale entries. We'll use one for the new entry.
284 */
285 else {
286 /*
287 * If we didn't do a compact then we need to figure out
288 * which stale entry will be used.
289 */
290 if (compact == 0) {
291 /*
292 * Find first stale entry before our insertion point.
293 */
294 for (lowstale = index - 1;
295 lowstale >= 0 &&
296 INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
297 XFS_DIR2_NULL_DATAPTR;
298 lowstale--)
299 continue;
300 /*
301 * Find next stale entry after insertion point.
302			 * Stop looking if the result would be farther from
303			 * index than the lowstale entry already found.
304 */
305 for (highstale = index;
306 highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
307 INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
308 XFS_DIR2_NULL_DATAPTR &&
309 (lowstale < 0 ||
310 index - lowstale - 1 >= highstale - index);
311 highstale++)
312 continue;
313 }
314 /*
315 * Using the low stale entry.
316 * Shift entries up toward the stale slot.
317 */
318 if (lowstale >= 0 &&
319 (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
320 index - lowstale - 1 < highstale - index)) {
321 ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
322 XFS_DIR2_NULL_DATAPTR);
323 ASSERT(index - lowstale - 1 >= 0);
324 if (index - lowstale - 1 > 0)
325 memmove(&leaf->ents[lowstale],
326 &leaf->ents[lowstale + 1],
327 (index - lowstale - 1) * sizeof(*lep));
328 lep = &leaf->ents[index - 1];
329 lfloglow = MIN(lowstale, lfloglow);
330 lfloghigh = MAX(index - 1, lfloghigh);
331 }
332 /*
333 * Using the high stale entry.
334 * Shift entries down toward the stale slot.
335 */
336 else {
337 ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
338 XFS_DIR2_NULL_DATAPTR);
339 ASSERT(highstale - index >= 0);
340 if (highstale - index > 0)
341 memmove(&leaf->ents[index + 1],
342 &leaf->ents[index],
343 (highstale - index) * sizeof(*lep));
344 lep = &leaf->ents[index];
345 lfloglow = MIN(index, lfloglow);
346 lfloghigh = MAX(highstale, lfloghigh);
347 }
348 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
349 }
350 /*
351 * Insert the new entry, log everything.
352 */
353 INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
354 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, args->blkno, args->index));
355 xfs_dir2_leaf_log_header(tp, bp);
356 xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
357 xfs_dir2_leafn_check(dp, bp);
358 return 0;
359}
360
361#ifdef DEBUG
362/*
363 * Check internal consistency of a leafn block.
364 */
365void
366xfs_dir2_leafn_check(
367 xfs_inode_t *dp, /* incore directory inode */
368 xfs_dabuf_t *bp) /* leaf buffer */
369{
370 int i; /* leaf index */
371 xfs_dir2_leaf_t *leaf; /* leaf structure */
372 xfs_mount_t *mp; /* filesystem mount point */
373 int stale; /* count of stale leaves */
374
375 leaf = bp->data;
376 mp = dp->i_mount;
377 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
378 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
379 for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
380 if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
381 ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
382 INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
383 }
384 if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
385 stale++;
386 }
387 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
388}
389#endif /* DEBUG */
390
391/*
392 * Return the last hash value in the leaf.
393 * Stale entries are ok.
394 */
395xfs_dahash_t /* hash value */
396xfs_dir2_leafn_lasthash(
397 xfs_dabuf_t *bp, /* leaf buffer */
398 int *count) /* count of entries in leaf */
399{
400 xfs_dir2_leaf_t *leaf; /* leaf structure */
401
402 leaf = bp->data;
403 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
404 if (count)
405 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
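	/*
	 * A raw test of the on-disk count against zero is endian-safe.
	 */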
406 if (!leaf->hdr.count)
407 return 0;
408 return INT_GET(leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
409}
410
411/*
412 * Look up a leaf entry in a node-format leaf block.
413 * If this is an addname then the extrablk in state is a freespace block,
414 * otherwise it's a data block.
415 */
416int
417xfs_dir2_leafn_lookup_int(
418 xfs_dabuf_t *bp, /* leaf buffer */
419 xfs_da_args_t *args, /* operation arguments */
420 int *indexp, /* out: leaf entry index */
421 xfs_da_state_t *state) /* state to fill in */
422{
423 xfs_dabuf_t *curbp; /* current data/free buffer */
424 xfs_dir2_db_t curdb; /* current data block number */
425 xfs_dir2_db_t curfdb; /* current free block number */
426 xfs_dir2_data_entry_t *dep; /* data block entry */
427 xfs_inode_t *dp; /* incore directory inode */
428 int error; /* error return value */
429 int fi; /* free entry index */
430 xfs_dir2_free_t *free=NULL; /* free block structure */
431 int index; /* leaf entry index */
432 xfs_dir2_leaf_t *leaf; /* leaf structure */
433 int length=0; /* length of new data entry */
434 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
435 xfs_mount_t *mp; /* filesystem mount point */
436 xfs_dir2_db_t newdb; /* new data block number */
437 xfs_dir2_db_t newfdb; /* new free block number */
438 xfs_trans_t *tp; /* transaction pointer */
439
440 dp = args->dp;
441 tp = args->trans;
442 mp = dp->i_mount;
443 leaf = bp->data;
444 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
445#ifdef __KERNEL__
446 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) > 0);
447#endif
448 xfs_dir2_leafn_check(dp, bp);
449 /*
450 * Look up the hash value in the leaf entries.
451 */
452 index = xfs_dir2_leaf_search_hash(args, bp);
453 /*
454 * Do we have a buffer coming in?
455 */
456 if (state->extravalid)
457 curbp = state->extrablk.bp;
458 else
459 curbp = NULL;
460 /*
461 * For addname, it's a free block buffer, get the block number.
462 */
463 if (args->addname) {
464 curfdb = curbp ? state->extrablk.blkno : -1;
465 curdb = -1;
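		/*
		 * Size of the data entry we would add, used for the
		 * freespace checks below.
		 */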
466 length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
467 if ((free = (curbp ? curbp->data : NULL)))
468 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
469 }
470 /*
471 * For others, it's a data block buffer, get the block number.
472 */
473 else {
474 curfdb = -1;
475 curdb = curbp ? state->extrablk.blkno : -1;
476 }
477 /*
478 * Loop over leaf entries with the right hash value.
479 */
480 for (lep = &leaf->ents[index];
481 index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
482 lep++, index++) {
483 /*
484 * Skip stale leaf entries.
485 */
486 if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
487 continue;
488 /*
489 * Pull the data block number from the entry.
490 */
491 newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
492 /*
493 * For addname, we're looking for a place to put the new entry.
494 * We want to use a data block with an entry of equal
495 * hash value to ours if there is one with room.
496 */
497 if (args->addname) {
498 /*
499 * If this block isn't the data block we already have
500 * in hand, take a look at it.
501 */
502 if (newdb != curdb) {
503 curdb = newdb;
504 /*
505 * Convert the data block to the free block
506 * holding its freespace information.
507 */
508 newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb);
509 /*
510 * If it's not the one we have in hand,
511 * read it in.
512 */
513 if (newfdb != curfdb) {
514 /*
515 * If we had one before, drop it.
516 */
517 if (curbp)
518 xfs_da_brelse(tp, curbp);
519 /*
520 * Read the free block.
521 */
522 if ((error = xfs_da_read_buf(tp, dp,
523 XFS_DIR2_DB_TO_DA(mp,
524 newfdb),
525 -1, &curbp,
526 XFS_DATA_FORK))) {
527 return error;
528 }
529 curfdb = newfdb;
530 free = curbp->data;
531 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) ==
532 XFS_DIR2_FREE_MAGIC);
533 ASSERT((INT_GET(free->hdr.firstdb, ARCH_CONVERT) %
534 XFS_DIR2_MAX_FREE_BESTS(mp)) ==
535 0);
536 ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) <= curdb);
537 ASSERT(curdb <
538 INT_GET(free->hdr.firstdb, ARCH_CONVERT) +
539 INT_GET(free->hdr.nvalid, ARCH_CONVERT));
540 }
541 /*
542 * Get the index for our entry.
543 */
544 fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb);
545 /*
546			 * A NULLDATAOFF entry here means on-disk corruption;
			 * otherwise, if the block has room, return it.
547 */
548 if (unlikely(INT_GET(free->bests[fi], ARCH_CONVERT) == NULLDATAOFF)) {
549 XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
550 XFS_ERRLEVEL_LOW, mp);
551 return XFS_ERROR(EFSCORRUPTED);
552 }
553 if (INT_GET(free->bests[fi], ARCH_CONVERT) >= length) {
554 *indexp = index;
555 state->extravalid = 1;
556 state->extrablk.bp = curbp;
557 state->extrablk.blkno = curfdb;
558 state->extrablk.index = fi;
559 state->extrablk.magic =
560 XFS_DIR2_FREE_MAGIC;
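					/*
					 * ENOENT plus the saved free
					 * block is the expected result
					 * for an addname lookup.
					 */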
561 ASSERT(args->oknoent);
562 return XFS_ERROR(ENOENT);
563 }
564 }
565 }
566 /*
567 * Not adding a new entry, so we really want to find
568 * the name given to us.
569 */
570 else {
571 /*
572 * If it's a different data block, go get it.
573 */
574 if (newdb != curdb) {
575 /*
576 * If we had a block before, drop it.
577 */
578 if (curbp)
579 xfs_da_brelse(tp, curbp);
580 /*
581 * Read the data block.
582 */
583 if ((error =
584 xfs_da_read_buf(tp, dp,
585 XFS_DIR2_DB_TO_DA(mp, newdb), -1,
586 &curbp, XFS_DATA_FORK))) {
587 return error;
588 }
589 xfs_dir2_data_check(dp, curbp);
590 curdb = newdb;
591 }
592 /*
593 * Point to the data entry.
594 */
595 dep = (xfs_dir2_data_entry_t *)
596 ((char *)curbp->data +
597 XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
598 /*
599 * Compare the entry, return it if it matches.
600 */
601 if (dep->namelen == args->namelen &&
602 dep->name[0] == args->name[0] &&
603 memcmp(dep->name, args->name, args->namelen) == 0) {
604 args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
605 *indexp = index;
606 state->extravalid = 1;
607 state->extrablk.bp = curbp;
608 state->extrablk.blkno = curdb;
609 state->extrablk.index =
610 (int)((char *)dep -
611 (char *)curbp->data);
612 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
613 return XFS_ERROR(EEXIST);
614 }
615 }
616 }
617 /*
618 * Didn't find a match.
619 * If we are holding a buffer, give it back in case our caller
620 * finds it useful.
621 */
622 if ((state->extravalid = (curbp != NULL))) {
623 state->extrablk.bp = curbp;
624 state->extrablk.index = -1;
625 /*
626 * For addname, giving back a free block.
627 */
628 if (args->addname) {
629 state->extrablk.blkno = curfdb;
630 state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
631 }
632 /*
633 * For other callers, giving back a data block.
634 */
635 else {
636 state->extrablk.blkno = curdb;
637 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
638 }
639 }
640 /*
641 * Return the final index, that will be the insertion point.
642 */
643 *indexp = index;
644 ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
645 return XFS_ERROR(ENOENT);
646}
647
648/*
649 * Move count leaf entries from source to destination leaf.
650 * Log entries and headers. Stale entries are preserved.
651 */
652static void
653xfs_dir2_leafn_moveents(
654 xfs_da_args_t *args, /* operation arguments */
655 xfs_dabuf_t *bp_s, /* source leaf buffer */
656 int start_s, /* source leaf index */
657 xfs_dabuf_t *bp_d, /* destination leaf buffer */
658 int start_d, /* destination leaf index */
659 int count) /* count of leaves to copy */
660{
661 xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */
662 xfs_dir2_leaf_t *leaf_s; /* source leaf structure */
663 int stale; /* count stale leaves copied */
664 xfs_trans_t *tp; /* transaction pointer */
665
666 xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d,
667 start_d, count);
668 /*
669 * Silently return if nothing to do.
670 */
671 if (count == 0) {
672 return;
673 }
674 tp = args->trans;
675 leaf_s = bp_s->data;
676 leaf_d = bp_d->data;
677 /*
678 * If the destination index is not the end of the current
679 * destination leaf entries, open up a hole in the destination
680 * to hold the new entries.
681 */
682 if (start_d < INT_GET(leaf_d->hdr.count, ARCH_CONVERT)) {
683 memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d],
684 (INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - start_d) *
685 sizeof(xfs_dir2_leaf_entry_t));
686 xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
687 count + INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - 1);
688 }
689 /*
690 * If the source has stale leaves, count the ones in the copy range
691 * so we can update the header correctly.
692 */
693 if (leaf_s->hdr.stale) {
694 int i; /* temp leaf index */
695
696 for (i = start_s, stale = 0; i < start_s + count; i++) {
697 if (INT_GET(leaf_s->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
698 stale++;
699 }
700 } else
701 stale = 0;
702 /*
703 * Copy the leaf entries from source to destination.
704 */
705 memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s],
706 count * sizeof(xfs_dir2_leaf_entry_t));
707 xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
708 /*
709 * If there are source entries after the ones we copied,
710 * delete the ones we copied by sliding the next ones down.
711 */
712 if (start_s + count < INT_GET(leaf_s->hdr.count, ARCH_CONVERT)) {
713 memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count],
714 count * sizeof(xfs_dir2_leaf_entry_t));
715 xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
716 }
717 /*
718 * Update the headers and log them.
719 */
720 INT_MOD(leaf_s->hdr.count, ARCH_CONVERT, -(count));
721 INT_MOD(leaf_s->hdr.stale, ARCH_CONVERT, -(stale));
722 INT_MOD(leaf_d->hdr.count, ARCH_CONVERT, count);
723 INT_MOD(leaf_d->hdr.stale, ARCH_CONVERT, stale);
724 xfs_dir2_leaf_log_header(tp, bp_s);
725 xfs_dir2_leaf_log_header(tp, bp_d);
726 xfs_dir2_leafn_check(args->dp, bp_s);
727 xfs_dir2_leafn_check(args->dp, bp_d);
728}
729
730/*
731 * Determine the sort order of two leaf blocks.
732 * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
733 */
734int /* sort order */
735xfs_dir2_leafn_order(
736 xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */
737 xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */
738{
739 xfs_dir2_leaf_t *leaf1; /* leaf1 structure */
740 xfs_dir2_leaf_t *leaf2; /* leaf2 structure */
741
742 leaf1 = leaf1_bp->data;
743 leaf2 = leaf2_bp->data;
744 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
745 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
746 if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0 &&
747 INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0 &&
748 (INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT) < INT_GET(leaf1->ents[0].hashval, ARCH_CONVERT) ||
749 INT_GET(leaf2->ents[INT_GET(leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT) <
750 INT_GET(leaf1->ents[INT_GET(leaf1->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)))
751 return 1;
752 return 0;
753}
754
755/*
756 * Rebalance leaf entries between two leaf blocks.
757 * This is actually only called when the second block is new,
758 * though the code deals with the general case.
759 * A new entry will be inserted in one of the blocks, and that
760 * entry is taken into account when balancing.
761 */
762static void
763xfs_dir2_leafn_rebalance(
764 xfs_da_state_t *state, /* btree cursor */
765 xfs_da_state_blk_t *blk1, /* first btree block */
766 xfs_da_state_blk_t *blk2) /* second btree block */
767{
768 xfs_da_args_t *args; /* operation arguments */
769 int count; /* count (& direction) leaves */
770 int isleft; /* new goes in left leaf */
771 xfs_dir2_leaf_t *leaf1; /* first leaf structure */
772 xfs_dir2_leaf_t *leaf2; /* second leaf structure */
773 int mid; /* midpoint leaf index */
774#ifdef DEBUG
775 int oldstale; /* old count of stale leaves */
776#endif
777 int oldsum; /* old total leaf count */
778 int swap; /* swapped leaf blocks */
779
780 args = state->args;
781 /*
782 * If the block order is wrong, swap the arguments.
783 */
784 if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
785 xfs_da_state_blk_t *tmp; /* temp for block swap */
786
787 tmp = blk1;
788 blk1 = blk2;
789 blk2 = tmp;
790 }
791 leaf1 = blk1->bp->data;
792 leaf2 = blk2->bp->data;
793 oldsum = INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT);
794#ifdef DEBUG
795 oldstale = INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT);
796#endif
797 mid = oldsum >> 1;
798 /*
799	 * If the old total count was odd, the count including the new
800	 * entry is even; use the middle entry's hash to pick its side.
801 */
802 if (oldsum & 1) {
803 xfs_dahash_t midhash; /* middle entry hash value */
804
805 if (mid >= INT_GET(leaf1->hdr.count, ARCH_CONVERT))
806 midhash = INT_GET(leaf2->ents[mid - INT_GET(leaf1->hdr.count, ARCH_CONVERT)].hashval, ARCH_CONVERT);
807 else
808 midhash = INT_GET(leaf1->ents[mid].hashval, ARCH_CONVERT);
809 isleft = args->hashval <= midhash;
810 }
811 /*
812	 * If the old count is even, the total including the new entry
813	 * is odd, so there's no preferred side for it.
814 * Pick the left one.
815 */
816 else
817 isleft = 1;
818 /*
819 * Calculate moved entry count. Positive means left-to-right,
820 * negative means right-to-left. Then move the entries.
821 */
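	/*
	 * The (isleft == 0) term adjusts the split by one entry
	 * according to the side the new entry will be inserted on.
	 */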
822 count = INT_GET(leaf1->hdr.count, ARCH_CONVERT) - mid + (isleft == 0);
823 if (count > 0)
824 xfs_dir2_leafn_moveents(args, blk1->bp,
825 INT_GET(leaf1->hdr.count, ARCH_CONVERT) - count, blk2->bp, 0, count);
826 else if (count < 0)
827 xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
828			INT_GET(leaf1->hdr.count, ARCH_CONVERT), -count);
829 ASSERT(INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT) == oldsum);
830 ASSERT(INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT) == oldstale);
831 /*
832 * Mark whether we're inserting into the old or new leaf.
833 */
834 if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) < INT_GET(leaf2->hdr.count, ARCH_CONVERT))
835 state->inleaf = swap;
836 else if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > INT_GET(leaf2->hdr.count, ARCH_CONVERT))
837 state->inleaf = !swap;
838 else
839 state->inleaf =
840 swap ^ (blk1->index <= INT_GET(leaf1->hdr.count, ARCH_CONVERT));
841 /*
842 * Adjust the expected index for insertion.
843 */
844 if (!state->inleaf)
845 blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
846
847 /*
848	 * Finally, sanity check that we are not returning a negative index.
849	 */
850	if (blk2->index < 0) {
851		state->inleaf = 1;
852		blk2->index = 0;
853		cmn_err(CE_ALERT,
854			"xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: "
855 "blk1->index %d\n",
856 blk1->index);
857 }
858}
859
860/*
861 * Remove an entry from a node directory.
862 * This removes the leaf entry and the data entry,
863 * and updates the free block if necessary.
864 */
865static int /* error */
866xfs_dir2_leafn_remove(
867 xfs_da_args_t *args, /* operation arguments */
868 xfs_dabuf_t *bp, /* leaf buffer */
869 int index, /* leaf entry index */
870 xfs_da_state_blk_t *dblk, /* data block */
871 int *rval) /* resulting block needs join */
872{
873 xfs_dir2_data_t *data; /* data block structure */
874 xfs_dir2_db_t db; /* data block number */
875 xfs_dabuf_t *dbp; /* data block buffer */
876 xfs_dir2_data_entry_t *dep; /* data block entry */
877 xfs_inode_t *dp; /* incore directory inode */
878 xfs_dir2_leaf_t *leaf; /* leaf structure */
879 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
880 int longest; /* longest data free entry */
881 int off; /* data block entry offset */
882 xfs_mount_t *mp; /* filesystem mount point */
883 int needlog; /* need to log data header */
884 int needscan; /* need to rescan data frees */
885 xfs_trans_t *tp; /* transaction pointer */
886
887 xfs_dir2_trace_args_sb("leafn_remove", args, index, bp);
888 dp = args->dp;
889 tp = args->trans;
890 mp = dp->i_mount;
891 leaf = bp->data;
892 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
893 /*
894 * Point to the entry we're removing.
895 */
896 lep = &leaf->ents[index];
897 /*
898 * Extract the data block and offset from the entry.
899 */
900 db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
901 ASSERT(dblk->blkno == db);
902 off = XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT));
903 ASSERT(dblk->index == off);
904 /*
905 * Kill the leaf entry by marking it stale.
906 * Log the leaf block changes.
907 */
908 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
909 xfs_dir2_leaf_log_header(tp, bp);
910 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
911 xfs_dir2_leaf_log_ents(tp, bp, index, index);
912 /*
913 * Make the data entry free. Keep track of the longest freespace
914 * in the data block in case it changes.
915 */
916 dbp = dblk->bp;
917 data = dbp->data;
918 dep = (xfs_dir2_data_entry_t *)((char *)data + off);
919 longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
920 needlog = needscan = 0;
921 xfs_dir2_data_make_free(tp, dbp, off,
922 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
923 /*
924 * Rescan the data block freespaces for bestfree.
925 * Log the data block header if needed.
926 */
927 if (needscan)
928 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
929 if (needlog)
930 xfs_dir2_data_log_header(tp, dbp);
931 xfs_dir2_data_check(dp, dbp);
932 /*
933 * If the longest data block freespace changes, need to update
934 * the corresponding freeblock entry.
935 */
936 if (longest < INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
937 int error; /* error return value */
938 xfs_dabuf_t *fbp; /* freeblock buffer */
939 xfs_dir2_db_t fdb; /* freeblock block number */
940 int findex; /* index in freeblock entries */
941 xfs_dir2_free_t *free; /* freeblock structure */
942 int logfree; /* need to log free entry */
943
944 /*
945		 * Convert the data block number to the freespace block
946		 * that covers it, and read that block in.
947 */
948 fdb = XFS_DIR2_DB_TO_FDB(mp, db);
949 if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb),
950 -1, &fbp, XFS_DATA_FORK))) {
951 return error;
952 }
953 free = fbp->data;
954 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
955 ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) ==
956 XFS_DIR2_MAX_FREE_BESTS(mp) *
957 (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
958 /*
959 * Calculate which entry we need to fix.
960 */
961 findex = XFS_DIR2_DB_TO_FDINDEX(mp, db);
962 longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
963 /*
964		 * If the data block is now empty (nothing left but the
965		 * header), we can get rid of it (usually).
966 */
967 if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) {
968 /*
969 * Try to punch out the data block.
970 */
971 error = xfs_dir2_shrink_inode(args, db, dbp);
972 if (error == 0) {
973 dblk->bp = NULL;
974 data = NULL;
975 }
976 /*
977 * We can get ENOSPC if there's no space reservation.
978		 * In this case just drop the buffer and someone else
979 * will eventually get rid of the empty block.
980 */
981 else if (error == ENOSPC && args->total == 0)
982 xfs_da_buf_done(dbp);
983 else
984 return error;
985 }
986 /*
987 * If we got rid of the data block, we can eliminate that entry
988 * in the free block.
989 */
990 if (data == NULL) {
991 /*
992 * One less used entry in the free table.
993 */
994 INT_MOD(free->hdr.nused, ARCH_CONVERT, -1);
995 xfs_dir2_free_log_header(tp, fbp);
996 /*
997 * If this was the last entry in the table, we can
998 * trim the table size back. There might be other
999 * entries at the end referring to non-existent
1000 * data blocks, get those too.
1001 */
1002 if (findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1) {
1003 int i; /* free entry index */
1004
1005 for (i = findex - 1;
1006 i >= 0 && INT_GET(free->bests[i], ARCH_CONVERT) == NULLDATAOFF;
1007 i--)
1008 continue;
1009 INT_SET(free->hdr.nvalid, ARCH_CONVERT, i + 1);
1010 logfree = 0;
1011 }
1012 /*
1013 * Not the last entry, just punch it out.
1014 */
1015 else {
1016 INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
1017 logfree = 1;
1018 }
1019 /*
1020 * If there are no useful entries left in the block,
1021 * get rid of the block if we can.
1022 */
1023 if (!free->hdr.nused) {
1024 error = xfs_dir2_shrink_inode(args, fdb, fbp);
1025 if (error == 0) {
1026 fbp = NULL;
1027 logfree = 0;
1028 } else if (error != ENOSPC || args->total != 0)
1029 return error;
1030 /*
1031 * It's possible to get ENOSPC if there is no
1032				 * space reservation. In this case someone
1033 * else will eventually get rid of this block.
1034 */
1035 }
1036 }
1037 /*
1038 * Data block is not empty, just set the free entry to
1039 * the new value.
1040 */
1041 else {
1042 INT_SET(free->bests[findex], ARCH_CONVERT, longest);
1043 logfree = 1;
1044 }
1045 /*
1046 * Log the free entry that changed, unless we got rid of it.
1047 */
1048 if (logfree)
1049 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
1050 /*
1051 * Drop the buffer if we still have it.
1052 */
1053 if (fbp)
1054 xfs_da_buf_done(fbp);
1055 }
1056 xfs_dir2_leafn_check(dp, bp);
1057 /*
1058	 * Return an indication of whether this leaf block is empty enough
1059 * to justify trying to join it with a neighbor.
1060 */
1061 *rval =
1062 ((uint)sizeof(leaf->hdr) +
1063 (uint)sizeof(leaf->ents[0]) *
1064 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT))) <
1065 mp->m_dir_magicpct;
1066 return 0;
1067}
1068
1069/*
1070 * Split the leaf entries in the old block into old and new blocks.
1071 */
1072int /* error */
1073xfs_dir2_leafn_split(
1074 xfs_da_state_t *state, /* btree cursor */
1075 xfs_da_state_blk_t *oldblk, /* original block */
1076 xfs_da_state_blk_t *newblk) /* newly created block */
1077{
1078 xfs_da_args_t *args; /* operation arguments */
1079 xfs_dablk_t blkno; /* new leaf block number */
1080 int error; /* error return value */
1081 xfs_mount_t *mp; /* filesystem mount point */
1082
1083 /*
1084 * Allocate space for a new leaf node.
1085 */
1086	args = state->args;
1087	ASSERT(args != NULL);
1088	mp = args->dp->i_mount;
1089 ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
1090 error = xfs_da_grow_inode(args, &blkno);
1091 if (error) {
1092 return error;
1093 }
1094 /*
1095 * Initialize the new leaf block.
1096 */
1097 error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno),
1098 &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
1099 if (error) {
1100 return error;
1101 }
1102 newblk->blkno = blkno;
1103 newblk->magic = XFS_DIR2_LEAFN_MAGIC;
1104 /*
1105	 * Rebalance the entries across the two leaves, then link the
1106	 * new block into the leaf sibling chain.
1107 */
1108 xfs_dir2_leafn_rebalance(state, oldblk, newblk);
1109 error = xfs_da_blk_link(state, oldblk, newblk);
1110 if (error) {
1111 return error;
1112 }
1113 /*
1114 * Insert the new entry in the correct block.
1115 */
1116 if (state->inleaf)
1117 error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
1118 else
1119 error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
1120 /*
1121 * Update last hashval in each block since we added the name.
1122 */
1123 oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
1124 newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
1125 xfs_dir2_leafn_check(args->dp, oldblk->bp);
1126 xfs_dir2_leafn_check(args->dp, newblk->bp);
1127 return error;
1128}
1129
1130/*
1131 * Check a leaf block and its neighbors to see if the block should be
1132 * collapsed into one or the other neighbor. Always keep the block
1133 * with the smaller block number.
1134 * If the current block is over 50% full, don't try to join it, return 0.
1135 * If the block is empty, fill in the state structure and return 2.
1136 * If it can be collapsed, fill in the state structure and return 1.
1137 * If nothing can be done, return 0.
1138 */
1139int /* error */
1140xfs_dir2_leafn_toosmall(
1141 xfs_da_state_t *state, /* btree cursor */
1142 int *action) /* resulting action to take */
1143{
1144 xfs_da_state_blk_t *blk; /* leaf block */
1145 xfs_dablk_t blkno; /* leaf block number */
1146 xfs_dabuf_t *bp; /* leaf buffer */
1147 int bytes; /* bytes in use */
1148 int count; /* leaf live entry count */
1149 int error; /* error return value */
1150 int forward; /* sibling block direction */
1151 int i; /* sibling counter */
1152 xfs_da_blkinfo_t *info; /* leaf block header */
1153 xfs_dir2_leaf_t *leaf; /* leaf structure */
1154 int rval; /* result from path_shift */
1155
1156 /*
1157 * Check for the degenerate case of the block being over 50% full.
1158 * If so, it's not worth even looking to see if we might be able
1159 * to coalesce with a sibling.
1160 */
1161 blk = &state->path.blk[state->path.active - 1];
1162 info = blk->bp->data;
1163 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1164 leaf = (xfs_dir2_leaf_t *)info;
1165 count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
1166 bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
1167 if (bytes > (state->blocksize >> 1)) {
1168 /*
1169 * Blk over 50%, don't try to join.
1170 */
1171 *action = 0;
1172 return 0;
1173 }
1174 /*
1175 * Check for the degenerate case of the block being empty.
1176 * If the block is empty, we'll simply delete it, no need to
1177 * coalesce it with a sibling block. We choose (arbitrarily)
1178 * to merge with the forward block unless it is NULL.
1179 */
1180 if (count == 0) {
1181 /*
1182 * Make altpath point to the block we want to keep and
1183 * path point to the block we want to drop (this one).
1184 */
1185 forward = info->forw;
1186 memcpy(&state->altpath, &state->path, sizeof(state->path));
1187 error = xfs_da_path_shift(state, &state->altpath, forward, 0,
1188 &rval);
1189 if (error)
1190 return error;
1191 *action = rval ? 2 : 0;
1192 return 0;
1193 }
1194 /*
1195 * Examine each sibling block to see if we can coalesce with
1196 * at least 25% free space to spare. We need to figure out
1197 * whether to merge with the forward or the backward block.
1198 * We prefer coalescing with the lower numbered sibling so as
1199 * to shrink a directory over time.
1200 */
1201 forward = INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT);
1202 for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
1203		blkno = forward ? INT_GET(info->forw, ARCH_CONVERT) : INT_GET(info->back, ARCH_CONVERT);
1204 if (blkno == 0)
1205 continue;
1206 /*
1207 * Read the sibling leaf block.
1208 */
1209 if ((error =
1210 xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
1211 -1, &bp, XFS_DATA_FORK))) {
1212 return error;
1213 }
1214 ASSERT(bp != NULL);
1215 /*
1216 * Count bytes in the two blocks combined.
1217 */
1218 leaf = (xfs_dir2_leaf_t *)info;
1219 count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
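		/*
		 * Start from 75% of the block size, so a non-negative
		 * result below means at least 25% would be left over.
		 */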
1220 bytes = state->blocksize - (state->blocksize >> 2);
1221 leaf = bp->data;
1222 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1223 count += INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
1224 bytes -= count * (uint)sizeof(leaf->ents[0]);
1225 /*
1226 * Fits with at least 25% to spare.
1227 */
1228 if (bytes >= 0)
1229 break;
1230 xfs_da_brelse(state->args->trans, bp);
1231 }
1232 /*
1233 * Didn't like either block, give up.
1234 */
1235 if (i >= 2) {
1236 *action = 0;
1237 return 0;
1238 }
1239 /*
1240 * Done with the sibling leaf block here, drop the dabuf
1241 * so path_shift can get it.
1242 */
1243 xfs_da_buf_done(bp);
1244 /*
1245 * Make altpath point to the block we want to keep (the lower
1246 * numbered block) and path point to the block we want to drop.
1247 */
1248 memcpy(&state->altpath, &state->path, sizeof(state->path));
1249 if (blkno < blk->blkno)
1250 error = xfs_da_path_shift(state, &state->altpath, forward, 0,
1251 &rval);
1252 else
1253 error = xfs_da_path_shift(state, &state->path, forward, 0,
1254 &rval);
1255 if (error) {
1256 return error;
1257 }
1258 *action = rval ? 0 : 1;
1259 return 0;
1260}
1261
1262/*
1263 * Move all the leaf entries from drop_blk to save_blk.
1264 * This is done as part of a join operation.
1265 */
1266void
1267xfs_dir2_leafn_unbalance(
1268 xfs_da_state_t *state, /* cursor */
1269 xfs_da_state_blk_t *drop_blk, /* dead block */
1270 xfs_da_state_blk_t *save_blk) /* surviving block */
1271{
1272 xfs_da_args_t *args; /* operation arguments */
1273 xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
1274 xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
1275
1276 args = state->args;
1277 ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
1278 ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
1279 drop_leaf = drop_blk->bp->data;
1280 save_leaf = save_blk->bp->data;
1281 ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1282 ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1283 /*
1284 * If there are any stale leaf entries, take this opportunity
1285 * to purge them.
1286 */
1287 if (INT_GET(drop_leaf->hdr.stale, ARCH_CONVERT))
1288 xfs_dir2_leaf_compact(args, drop_blk->bp);
1289 if (INT_GET(save_leaf->hdr.stale, ARCH_CONVERT))
1290 xfs_dir2_leaf_compact(args, save_blk->bp);
1291 /*
1292 * Move the entries from drop to the appropriate end of save.
1293 */
1294 drop_blk->hashval = INT_GET(drop_leaf->ents[INT_GET(drop_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1295 if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
1296 xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
1297 INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
1298 else
1299 xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
1300 INT_GET(save_leaf->hdr.count, ARCH_CONVERT), INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
1301 save_blk->hashval = INT_GET(save_leaf->ents[INT_GET(save_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1302 xfs_dir2_leafn_check(args->dp, save_blk->bp);
1303}
1304
1305/*
1306 * Top-level node form directory addname routine.
1307 */
1308int /* error */
1309xfs_dir2_node_addname(
1310 xfs_da_args_t *args) /* operation arguments */
1311{
1312 xfs_da_state_blk_t *blk; /* leaf block for insert */
1313 int error; /* error return value */
1314 int rval; /* sub-return value */
1315 xfs_da_state_t *state; /* btree cursor */
1316
1317 xfs_dir2_trace_args("node_addname", args);
1318 /*
1319 * Allocate and initialize the state (btree cursor).
1320 */
1321 state = xfs_da_state_alloc();
1322 state->args = args;
1323 state->mp = args->dp->i_mount;
1324 state->blocksize = state->mp->m_dirblksize;
1325 state->node_ents = state->mp->m_dir_node_ents;
1326 /*
1327 * Look up the name. We're not supposed to find it, but
1328 * this gives us the insertion point.
1329 */
1330 error = xfs_da_node_lookup_int(state, &rval);
1331 if (error)
1332 rval = error;
1333 if (rval != ENOENT) {
1334 goto done;
1335 }
1336 /*
1337 * Add the data entry to a data block.
1338	 * Extravalid is set if lookup found a freespace block for us.
1339 */
1340 rval = xfs_dir2_node_addname_int(args,
1341 state->extravalid ? &state->extrablk : NULL);
1342 if (rval) {
1343 goto done;
1344 }
1345 blk = &state->path.blk[state->path.active - 1];
1346 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1347 /*
1348 * Add the new leaf entry.
1349 */
1350 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
1351 if (rval == 0) {
1352 /*
1353 * It worked, fix the hash values up the btree.
1354 */
1355 if (!args->justcheck)
1356 xfs_da_fixhashpath(state, &state->path);
1357 } else {
1358 /*
1359 * It didn't work, we need to split the leaf block.
1360 */
1361 if (args->total == 0) {
1362 ASSERT(rval == ENOSPC);
1363 goto done;
1364 }
1365 /*
1366 * Split the leaf block and insert the new entry.
1367 */
1368 rval = xfs_da_split(state);
1369 }
1370done:
1371 xfs_da_state_free(state);
1372 return rval;
1373}
1374
1375/*
1376 * Add the data entry for a node-format directory name addition.
1377 * The leaf entry is added in xfs_dir2_leafn_add.
1378 * We may enter with a freespace block that the lookup found.
1379 */
1380static int /* error */
1381xfs_dir2_node_addname_int(
1382 xfs_da_args_t *args, /* operation arguments */
1383 xfs_da_state_blk_t *fblk) /* optional freespace block */
1384{
1385 xfs_dir2_data_t *data; /* data block structure */
1386 xfs_dir2_db_t dbno; /* data block number */
1387 xfs_dabuf_t *dbp; /* data block buffer */
1388 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1389 xfs_inode_t *dp; /* incore directory inode */
1390 xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
1391 int error; /* error return value */
1392 xfs_dir2_db_t fbno; /* freespace block number */
1393 xfs_dabuf_t *fbp; /* freespace buffer */
1394 int findex; /* freespace entry index */
1395 xfs_dir2_free_t *free=NULL; /* freespace block structure */
1396 xfs_dir2_db_t ifbno; /* initial freespace block no */
1397 xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
1398 int length; /* length of the new entry */
1399 int logfree; /* need to log free entry */
1400 xfs_mount_t *mp; /* filesystem mount point */
1401 int needlog; /* need to log data header */
1402 int needscan; /* need to rescan data frees */
1403 xfs_dir2_data_off_t *tagp; /* data entry tag pointer */
1404 xfs_trans_t *tp; /* transaction pointer */
1405
1406 dp = args->dp;
1407 mp = dp->i_mount;
1408 tp = args->trans;
1409 length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
1410 /*
1411	 * If we came in with a freespace block, it means that lookup
1412 * found an entry with our hash value. This is the freespace
1413 * block for that data entry.
1414 */
1415 if (fblk) {
1416 fbp = fblk->bp;
1417 /*
1418 * Remember initial freespace block number.
1419 */
1420 ifbno = fblk->blkno;
1421 free = fbp->data;
1422 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1423 findex = fblk->index;
1424 /*
1425 * This means the free entry showed that the data block had
1426 * space for our entry, so we remembered it.
1427 * Use that data block.
1428 */
1429 if (findex >= 0) {
1430 ASSERT(findex < INT_GET(free->hdr.nvalid, ARCH_CONVERT));
1431 ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF);
1432 ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) >= length);
1433 dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
1434 }
1435 /*
1436		 * The data block we looked at didn't have enough room.
1437 * We'll start at the beginning of the freespace entries.
1438 */
1439 else {
1440 dbno = -1;
1441 findex = 0;
1442 }
1443 }
1444 /*
1445 * Didn't come in with a freespace block, so don't have a data block.
1446 */
1447 else {
1448 ifbno = dbno = -1;
1449 fbp = NULL;
1450 findex = 0;
1451 }
1452 /*
1453 * If we don't have a data block yet, we're going to scan the
1454 * freespace blocks looking for one. Figure out what the
1455 * highest freespace block number is.
1456 */
1457 if (dbno == -1) {
1458 xfs_fileoff_t fo; /* freespace block number */
1459
1460 if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
1461 return error;
1462 lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo);
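		/*
		 * Start the scan at ifbno; with no initial block this
		 * is -1, and the loop's pre-increment then starts us
		 * at the first freespace block.
		 */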
1463 fbno = ifbno;
1464 }
1465 /*
1466	 * While we haven't identified a data block, search the freeblock
1467	 * data for a good data block.  Null freeblock entries, which
1468	 * indicate holes in the data blocks, are skipped.
1469 */
1470 while (dbno == -1) {
1471 /*
1472 * If we don't have a freeblock in hand, get the next one.
1473 */
1474 if (fbp == NULL) {
1475 /*
1476 * Happens the first time through unless lookup gave
1477 * us a freespace block to start with.
1478 */
1479 if (++fbno == 0)
1480 fbno = XFS_DIR2_FREE_FIRSTDB(mp);
1481 /*
1482 * If it's ifbno we already looked at it.
1483 */
1484 if (fbno == ifbno)
1485 fbno++;
1486 /*
1487 * If it's off the end we're done.
1488 */
1489 if (fbno >= lastfbno)
1490 break;
1491 /*
1492 * Read the block. There can be holes in the
1493 * freespace blocks, so this might not succeed.
1494			 * This should be really rare.  The -2 mappedbno arg makes
1495			 * the read return a NULL buffer for a hole, not an error.
1496 */
1497 if ((error = xfs_da_read_buf(tp, dp,
1498 XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp,
1499 XFS_DATA_FORK))) {
1500 return error;
1501 }
1502 if (unlikely(fbp == NULL)) {
1503 continue;
1504 }
1505 free = fbp->data;
1506 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1507 findex = 0;
1508 }
1509 /*
1510 * Look at the current free entry. Is it good enough?
1511 */
1512 if (INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF &&
1513 INT_GET(free->bests[findex], ARCH_CONVERT) >= length)
1514 dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
1515 else {
1516 /*
1517 * Are we done with the freeblock?
1518 */
1519 if (++findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
1520 /*
1521 * Drop the block.
1522 */
1523 xfs_da_brelse(tp, fbp);
1524 fbp = NULL;
1525 if (fblk && fblk->bp)
1526 fblk->bp = NULL;
1527 }
1528 }
1529 }
1530 /*
1531 * If we don't have a data block, we need to allocate one and make
1532 * the freespace entries refer to it.
1533 */
1534 if (unlikely(dbno == -1)) {
1535 /*
1536 * Not allowed to allocate, return failure.
1537 */
1538 if (args->justcheck || args->total == 0) {
1539 /*
1540 * Drop the freespace buffer unless it came from our
1541 * caller.
1542 */
1543 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1544 xfs_da_buf_done(fbp);
1545 return XFS_ERROR(ENOSPC);
1546 }
1547 /*
1548 * Allocate and initialize the new data block.
1549 */
1550 if (unlikely((error = xfs_dir2_grow_inode(args,
1551 XFS_DIR2_DATA_SPACE,
1552 &dbno)) ||
1553 (error = xfs_dir2_data_init(args, dbno, &dbp)))) {
1554 /*
1555 * Drop the freespace buffer unless it came from our
1556 * caller.
1557 */
1558 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1559 xfs_da_buf_done(fbp);
1560 return error;
1561 }
1562 /*
1563 * If (somehow) we have a freespace block, get rid of it.
1564 */
1565 if (fbp)
1566 xfs_da_brelse(tp, fbp);
1567 if (fblk && fblk->bp)
1568 fblk->bp = NULL;
1569
1570 /*
1571 * Get the freespace block corresponding to the data block
1572 * that was just allocated.
1573 */
1574 fbno = XFS_DIR2_DB_TO_FDB(mp, dbno);
1575 if (unlikely(error = xfs_da_read_buf(tp, dp,
1576 XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp,
1577 XFS_DATA_FORK))) {
1578 xfs_da_buf_done(dbp);
1579 return error;
1580 }
1581 /*
1582 * If there wasn't a freespace block, the read will
1583 * return a NULL fbp. Allocate and initialize a new one.
1584 */
1585		if (fbp == NULL) {
1586 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
1587 &fbno))) {
				xfs_da_buf_done(dbp);
1588				return error;
1589 }
1590
1591 if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) {
1592 cmn_err(CE_ALERT,
1593 "xfs_dir2_node_addname_int: dir ino "
1594 "%llu needed freesp block %lld for\n"
1595 " data block %lld, got %lld\n"
1596 " ifbno %llu lastfbno %d\n",
1597 (unsigned long long)dp->i_ino,
1598 (long long)XFS_DIR2_DB_TO_FDB(mp, dbno),
1599 (long long)dbno, (long long)fbno,
1600 (unsigned long long)ifbno, lastfbno);
1601 if (fblk) {
1602 cmn_err(CE_ALERT,
1603 " fblk 0x%p blkno %llu "
1604 "index %d magic 0x%x\n",
1605 fblk,
1606 (unsigned long long)fblk->blkno,
1607 fblk->index,
1608 fblk->magic);
1609 } else {
1610 cmn_err(CE_ALERT,
1611 " ... fblk is NULL\n");
1612 }
1613 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1614 XFS_ERRLEVEL_LOW, mp);
1615 return XFS_ERROR(EFSCORRUPTED);
1616 }
1617
1618 /*
1619 * Get a buffer for the new block.
1620 */
1621 if ((error = xfs_da_get_buf(tp, dp,
1622 XFS_DIR2_DB_TO_DA(mp, fbno),
1623 -1, &fbp, XFS_DATA_FORK))) {
1624 return error;
1625 }
1626 ASSERT(fbp != NULL);
1627
1628 /*
1629 * Initialize the new block to be empty, and remember
1630 * its first slot as our empty slot.
1631 */
1632 free = fbp->data;
1633 INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
1634 INT_SET(free->hdr.firstdb, ARCH_CONVERT,
1635 (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
1636 XFS_DIR2_MAX_FREE_BESTS(mp));
1637 free->hdr.nvalid = 0;
1638 free->hdr.nused = 0;
1639 } else {
1640 free = fbp->data;
1641 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1642 }
1643
1644 /*
1645 * Set the freespace block index from the data block number.
1646 */
1647 findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno);
1648 /*
1649 * If it's after the end of the current entries in the
1650 * freespace block, extend that table.
1651 */
1652 if (findex >= INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
1653 ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp));
1654 INT_SET(free->hdr.nvalid, ARCH_CONVERT, findex + 1);
1655 /*
1656			 * Tag the new entry as unused so the nused bump below fires.
1657 */
1658 INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
1659 }
1660 /*
1661 * If this entry was for an empty data block
1662 * (this should always be true) then update the header.
1663 */
1664 if (INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) {
1665 INT_MOD(free->hdr.nused, ARCH_CONVERT, +1);
1666 xfs_dir2_free_log_header(tp, fbp);
1667 }
1668 /*
1669 * Update the real value in the table.
1670 * We haven't allocated the data entry yet so this will
1671 * change again.
1672 */
1673 data = dbp->data;
1674 INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
1675 logfree = 1;
1676 }
1677 /*
1678 * We had a data block so we don't have to make a new one.
1679 */
1680 else {
1681 /*
1682 * If just checking, we succeeded.
1683 */
1684 if (args->justcheck) {
1685 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1686 xfs_da_buf_done(fbp);
1687 return 0;
1688 }
1689 /*
1690 * Read the data block in.
1691 */
1692 if (unlikely(
1693 error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno),
1694 -1, &dbp, XFS_DATA_FORK))) {
1695 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1696 xfs_da_buf_done(fbp);
1697 return error;
1698 }
1699 data = dbp->data;
1700 logfree = 0;
1701 }
1702 ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) >= length);
1703 /*
1704 * Point to the existing unused space.
1705 */
1706 dup = (xfs_dir2_data_unused_t *)
1707 ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
1708 needscan = needlog = 0;
1709 /*
1710	 * Mark the first part of the unused space as in use by us.
1711 */
1712 xfs_dir2_data_use_free(tp, dbp, dup,
1713 (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
1714 &needlog, &needscan);
1715 /*
1716 * Fill in the new entry and log it.
1717 */
1718 dep = (xfs_dir2_data_entry_t *)dup;
1719 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
1720 dep->namelen = args->namelen;
1721 memcpy(dep->name, args->name, dep->namelen);
1722 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1723 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
1724 xfs_dir2_data_log_entry(tp, dbp, dep);
1725 /*
1726 * Rescan the block for bestfree if needed.
1727 */
1728 if (needscan)
1729 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
1730 /*
1731 * Log the data block header if needed.
1732 */
1733 if (needlog)
1734 xfs_dir2_data_log_header(tp, dbp);
1735 /*
1736 * If the freespace entry is now wrong, update it.
1737 */
1738 if (INT_GET(free->bests[findex], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
1739 INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
1740 logfree = 1;
1741 }
1742 /*
1743 * Log the freespace entry if needed.
1744 */
1745 if (logfree)
1746 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
1747 /*
1748 * If the caller didn't hand us the freespace block, drop it.
1749 */
1750 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1751 xfs_da_buf_done(fbp);
1752 /*
1753 * Return the data block and offset in args, then drop the data block.
1754 */
1755 args->blkno = (xfs_dablk_t)dbno;
1756 args->index = INT_GET(*tagp, ARCH_CONVERT);
1757 xfs_da_buf_done(dbp);
1758 return 0;
1759}
1760
1761/*
1762 * Look up an entry in a node-format directory.
1763 * All the real work happens in xfs_da_node_lookup_int.
1764 * The only real output is the inode number of the entry.
1765 */
1766int /* error */
1767xfs_dir2_node_lookup(
1768 xfs_da_args_t *args) /* operation arguments */
1769{
1770 int error; /* error return value */
1771 int i; /* btree level */
1772 int rval; /* operation return value */
1773 xfs_da_state_t *state; /* btree cursor */
1774
1775 xfs_dir2_trace_args("node_lookup", args);
1776 /*
1777 * Allocate and initialize the btree cursor.
1778 */
1779 state = xfs_da_state_alloc();
1780 state->args = args;
1781 state->mp = args->dp->i_mount;
1782 state->blocksize = state->mp->m_dirblksize;
1783 state->node_ents = state->mp->m_dir_node_ents;
1784 /*
1785 * Fill in the path to the entry in the cursor.
1786 */
1787 error = xfs_da_node_lookup_int(state, &rval);
1788 if (error)
1789 rval = error;
1790 /*
1791 * Release the btree blocks and leaf block.
1792 */
1793 for (i = 0; i < state->path.active; i++) {
1794 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1795 state->path.blk[i].bp = NULL;
1796 }
1797 /*
1798 * Release the data block if we have it.
1799 */
1800 if (state->extravalid && state->extrablk.bp) {
1801 xfs_da_brelse(args->trans, state->extrablk.bp);
1802 state->extrablk.bp = NULL;
1803 }
1804 xfs_da_state_free(state);
1805 return rval;
1806}
1807
1808/*
1809 * Remove an entry from a node-format directory.
1810 */
1811int /* error */
1812xfs_dir2_node_removename(
1813 xfs_da_args_t *args) /* operation arguments */
1814{
1815 xfs_da_state_blk_t *blk; /* leaf block */
1816 int error; /* error return value */
1817 int rval; /* operation return value */
1818 xfs_da_state_t *state; /* btree cursor */
1819
1820 xfs_dir2_trace_args("node_removename", args);
1821 /*
1822 * Allocate and initialize the btree cursor.
1823 */
1824 state = xfs_da_state_alloc();
1825 state->args = args;
1826 state->mp = args->dp->i_mount;
1827 state->blocksize = state->mp->m_dirblksize;
1828 state->node_ents = state->mp->m_dir_node_ents;
1829 /*
1830 * Look up the entry we're deleting, set up the cursor.
1831 */
1832 error = xfs_da_node_lookup_int(state, &rval);
1833 if (error) {
1834 rval = error;
1835 }
1836 /*
1837 * Didn't find it, upper layer screwed up.
1838 */
1839 if (rval != EEXIST) {
1840 xfs_da_state_free(state);
1841 return rval;
1842 }
1843 blk = &state->path.blk[state->path.active - 1];
1844 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1845 ASSERT(state->extravalid);
1846 /*
1847 * Remove the leaf and data entries.
1848 * Extrablk refers to the data block.
1849 */
1850 error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
1851 &state->extrablk, &rval);
1852	if (error) {
		xfs_da_state_free(state);
1853		return error;
1854	}
1855 /*
1856 * Fix the hash values up the btree.
1857 */
1858 xfs_da_fixhashpath(state, &state->path);
1859 /*
1860 * If we need to join leaf blocks, do it.
1861 */
1862 if (rval && state->path.active > 1)
1863 error = xfs_da_join(state);
1864 /*
1865 * If no errors so far, try conversion to leaf format.
1866 */
1867 if (!error)
1868 error = xfs_dir2_node_to_leaf(state);
1869 xfs_da_state_free(state);
1870 return error;
1871}
1872
1873/*
1874 * Replace an entry's inode number in a node-format directory.
1875 */
1876int /* error */
1877xfs_dir2_node_replace(
1878 xfs_da_args_t *args) /* operation arguments */
1879{
1880 xfs_da_state_blk_t *blk; /* leaf block */
1881 xfs_dir2_data_t *data; /* data block structure */
1882 xfs_dir2_data_entry_t *dep; /* data entry changed */
1883 int error; /* error return value */
1884 int i; /* btree level */
1885 xfs_ino_t inum; /* new inode number */
1886 xfs_dir2_leaf_t *leaf; /* leaf structure */
1887 xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
1888 int rval; /* internal return value */
1889 xfs_da_state_t *state; /* btree cursor */
1890
1891 xfs_dir2_trace_args("node_replace", args);
1892 /*
1893 * Allocate and initialize the btree cursor.
1894 */
1895 state = xfs_da_state_alloc();
1896 state->args = args;
1897 state->mp = args->dp->i_mount;
1898 state->blocksize = state->mp->m_dirblksize;
1899 state->node_ents = state->mp->m_dir_node_ents;
1900 inum = args->inumber;
1901 /*
1902 * Lookup the entry to change in the btree.
1903 */
1904 error = xfs_da_node_lookup_int(state, &rval);
1905 if (error) {
1906 rval = error;
1907 }
1908 /*
1909 * It should be found, since the vnodeops layer has looked it up
1910 * and locked it. But paranoia is good.
1911 */
1912 if (rval == EEXIST) {
1913 /*
1914 * Find the leaf entry.
1915 */
1916 blk = &state->path.blk[state->path.active - 1];
1917 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1918 leaf = blk->bp->data;
1919 lep = &leaf->ents[blk->index];
1920 ASSERT(state->extravalid);
1921 /*
1922 * Point to the data entry.
1923 */
1924 data = state->extrablk.bp->data;
1925 ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
1926 dep = (xfs_dir2_data_entry_t *)
1927 ((char *)data +
1928 XFS_DIR2_DATAPTR_TO_OFF(state->mp, INT_GET(lep->address, ARCH_CONVERT)));
1929 ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
1930 /*
1931 * Fill in the new inode number and log the entry.
1932 */
1933 INT_SET(dep->inumber, ARCH_CONVERT, inum);
1934 xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
1935 rval = 0;
1936 }
1937 /*
1938 * Didn't find it, and we're holding a data block. Drop it.
1939 */
1940 else if (state->extravalid) {
1941 xfs_da_brelse(args->trans, state->extrablk.bp);
1942 state->extrablk.bp = NULL;
1943 }
1944 /*
1945 * Release all the buffers in the cursor.
1946 */
1947 for (i = 0; i < state->path.active; i++) {
1948 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1949 state->path.blk[i].bp = NULL;
1950 }
1951 xfs_da_state_free(state);
1952 return rval;
1953}
1954
1955/*
1956 * Trim off a trailing empty freespace block.
1957 * Return (in rvalp) 1 if we did it, 0 if not.
1958 */
1959int /* error */
1960xfs_dir2_node_trim_free(
1961 xfs_da_args_t *args, /* operation arguments */
1962 xfs_fileoff_t fo, /* free block number */
1963 int *rvalp) /* out: did something */
1964{
1965 xfs_dabuf_t *bp; /* freespace buffer */
1966 xfs_inode_t *dp; /* incore directory inode */
1967 int error; /* error return code */
1968 xfs_dir2_free_t *free; /* freespace structure */
1969 xfs_mount_t *mp; /* filesystem mount point */
1970 xfs_trans_t *tp; /* transaction pointer */
1971
1972 dp = args->dp;
1973 mp = dp->i_mount;
1974 tp = args->trans;
1975 /*
1976 * Read the freespace block.
1977 */
1978 if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
1979 XFS_DATA_FORK))) {
1980 return error;
1981 }
1982
1983 /*
1984 * There can be holes in freespace. If fo is a hole, there's
1985 * nothing to do.
1986 */
1987 if (bp == NULL) {
1988 *rvalp = 0; /* nothing done */
1989 return 0;
1990 }
1990 free = bp->data;
1991 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1992 /*
1993 * If there are used entries, there's nothing to do.
1994 */
1995 if (INT_GET(free->hdr.nused, ARCH_CONVERT) > 0) {
1996 xfs_da_brelse(tp, bp);
1997 *rvalp = 0;
1998 return 0;
1999 }
2000 /*
2001 * Blow the block away.
2002 */
2003 if ((error =
2004 xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo),
2005 bp))) {
2006 /*
2007 * Can't fail with ENOSPC since that only happens with no
2008 * space reservation, when breaking up an extent into two
2009 * pieces. This is the last block of an extent.
2010 */
2011 ASSERT(error != ENOSPC);
2012 xfs_da_brelse(tp, bp);
2013 return error;
2014 }
2015 /*
2016 * Return that we succeeded.
2017 */
2018 *rvalp = 1;
2019 return 0;
2020}
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
new file mode 100644
index 000000000000..96db420c7c5c
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.h
@@ -0,0 +1,159 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_NODE_H__
33#define __XFS_DIR2_NODE_H__
34
35/*
36 * Directory version 2, btree node format structures
37 */
38
39struct uio;
40struct xfs_dabuf;
41struct xfs_da_args;
42struct xfs_da_state;
43struct xfs_da_state_blk;
44struct xfs_inode;
45struct xfs_trans;
46
47/*
48 * Constants.
49 */
50
51/*
52 * Offset of the freespace index.
53 */
54#define XFS_DIR2_FREE_SPACE 2
55#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
56#define XFS_DIR2_FREE_FIRSTDB(mp) \
57 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET)
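/*
 * Worked example (assumed values): dir2 splits a directory's logical byte
 * space into XFS_DIR2_SPACE_SIZE regions -- data in region 0, leaf in
 * region 1, freespace in region 2, hence XFS_DIR2_FREE_SPACE == 2.
 * Assuming the usual 32GB region size (1ULL << 35) and 4096-byte
 * directory blocks:
 *
 *	XFS_DIR2_FREE_OFFSET      = 2 * 2^35 = 2^36 bytes
 *	XFS_DIR2_FREE_FIRSTDB(mp) = 2^36 / 4096 = 2^24
 *
 * so freespace block numbers can never collide with data block numbers.
 */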
58
59#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */
60
61/*
62 * Structures.
63 */
64typedef struct xfs_dir2_free_hdr {
65 __uint32_t magic; /* XFS_DIR2_FREE_MAGIC */
66 __int32_t firstdb; /* db of first entry */
67 __int32_t nvalid; /* count of valid entries */
68 __int32_t nused; /* count of used entries */
69} xfs_dir2_free_hdr_t;
70
71typedef struct xfs_dir2_free {
72 xfs_dir2_free_hdr_t hdr; /* block header */
73 xfs_dir2_data_off_t bests[1]; /* best free counts */
74 /* unused entries are -1 */
75} xfs_dir2_free_t;
76#define XFS_DIR2_MAX_FREE_BESTS(mp) \
77 (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \
78 (uint)sizeof(xfs_dir2_data_off_t))
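/*
 * Worked example: with an (assumed) 4096-byte directory block, the header
 * above is 16 bytes (four 32-bit fields) and each best-free slot is a
 * 2-byte xfs_dir2_data_off_t, so
 *
 *	XFS_DIR2_MAX_FREE_BESTS = (4096 - 16) / 2 = 2040
 *
 * i.e. one freespace block indexes the best free space of up to 2040
 * data blocks.
 */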
79
80/*
81 * Macros.
82 */
83
84/*
85 * Convert data space db to the corresponding free db.
86 */
87#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDB)
88xfs_dir2_db_t
89xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db);
90#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db)
91#else
92#define XFS_DIR2_DB_TO_FDB(mp,db) \
93 (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp))
94#endif
95
96/*
97 * Convert data space db to the corresponding index in a free db.
98 */
99#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
100int
101xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db);
102#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db)
103#else
104#define XFS_DIR2_DB_TO_FDINDEX(mp,db) ((db) % XFS_DIR2_MAX_FREE_BESTS(mp))
105#endif
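/*
 * Worked example, continuing the 2040-bests-per-block assumption above:
 * data block db = 5000 maps to
 *
 *	XFS_DIR2_DB_TO_FDB(mp, 5000)     = FIRSTDB + 5000 / 2040 = FIRSTDB + 2
 *	XFS_DIR2_DB_TO_FDINDEX(mp, 5000) = 5000 % 2040           = 920
 *
 * so its best-free value lives in slot 920 of the third freespace block.
 */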
106
107/*
108 * Functions.
109 */
110
111extern void
112 xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
113 int first, int last);
114
115extern int
116 xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_dabuf *lbp);
117
118extern xfs_dahash_t
119 xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
120
121extern int
122 xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp,
123 struct xfs_da_args *args, int *indexp,
124 struct xfs_da_state *state);
125
126extern int
127 xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp,
128 struct xfs_dabuf *leaf2_bp);
129
130extern int
131 xfs_dir2_leafn_split(struct xfs_da_state *state,
132 struct xfs_da_state_blk *oldblk,
133 struct xfs_da_state_blk *newblk);
134
135extern int
136 xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
137
138extern void
139 xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
140 struct xfs_da_state_blk *drop_blk,
141 struct xfs_da_state_blk *save_blk);
142
143extern int
144 xfs_dir2_node_addname(struct xfs_da_args *args);
145
146extern int
147 xfs_dir2_node_lookup(struct xfs_da_args *args);
148
149extern int
150 xfs_dir2_node_removename(struct xfs_da_args *args);
151
152extern int
153 xfs_dir2_node_replace(struct xfs_da_args *args);
154
155extern int
156 xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
157 int *rvalp);
158
159#endif /* __XFS_DIR2_NODE_H__ */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
new file mode 100644
index 000000000000..6bbc61674411
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -0,0 +1,1317 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_sf.c
35 * Shortform directory implementation for v2 directories.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_sb.h"
46#include "xfs_dir.h"
47#include "xfs_dir2.h"
48#include "xfs_dmapi.h"
49#include "xfs_mount.h"
50#include "xfs_bmap_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_da_btree.h"
58#include "xfs_dir_leaf.h"
59#include "xfs_error.h"
60#include "xfs_dir2_data.h"
61#include "xfs_dir2_leaf.h"
62#include "xfs_dir2_block.h"
63#include "xfs_dir2_trace.h"
64
65/*
66 * Prototypes for internal functions.
67 */
68static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
69 xfs_dir2_sf_entry_t *sfep,
70 xfs_dir2_data_aoff_t offset,
71 int new_isize);
72static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
73 int new_isize);
74static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
75 xfs_dir2_sf_entry_t **sfepp,
76 xfs_dir2_data_aoff_t *offsetp);
77#ifdef DEBUG
78static void xfs_dir2_sf_check(xfs_da_args_t *args);
79#else
80#define xfs_dir2_sf_check(args)
81#endif /* DEBUG */
82#if XFS_BIG_INUMS
83static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
84static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
85#endif /* XFS_BIG_INUMS */
86
87/*
88 * Given a block directory (dp/block), calculate its size as a shortform (sf)
89 * directory and a header for the sf directory, if it will fit in the
90 * space currently present in the inode. If it won't fit, the output
91 * size is simply too big (not an accurate size).
92 */
93int /* size for sf form */
94xfs_dir2_block_sfsize(
95 xfs_inode_t *dp, /* incore inode pointer */
96 xfs_dir2_block_t *block, /* block directory data */
97 xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */
98{
99 xfs_dir2_dataptr_t addr; /* data entry address */
100 xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */
101 xfs_dir2_block_tail_t *btp; /* tail area of the block */
102 int count; /* shortform entry count */
103 xfs_dir2_data_entry_t *dep; /* data entry in the block */
104 int i; /* block entry index */
105 int i8count; /* count of big-inode entries */
106 int isdot; /* entry is "." */
107 int isdotdot; /* entry is ".." */
108 xfs_mount_t *mp; /* mount structure pointer */
109 int namelen; /* total name bytes */
110 xfs_ino_t parent; /* parent inode number */
111 int size=0; /* total computed size */
112
113 mp = dp->i_mount;
114
115 count = i8count = namelen = 0;
116 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
117 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
118
119 /*
120 * Iterate over the block's data entries by using the leaf pointers.
121 */
122 for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
123 if ((addr = INT_GET(blp[i].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
124 continue;
125 /*
126 * Calculate the pointer to the entry at hand.
127 */
128 dep = (xfs_dir2_data_entry_t *)
129 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
130 /*
131 * Detect . and .., so we can special-case them.
132 * . is not included in sf directories.
133 * .. is included by just the parent inode number.
134 */
135 isdot = dep->namelen == 1 && dep->name[0] == '.';
136 isdotdot =
137 dep->namelen == 2 &&
138 dep->name[0] == '.' && dep->name[1] == '.';
139#if XFS_BIG_INUMS
140 if (!isdot)
141 i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
142#endif
143 if (!isdot && !isdotdot) {
144 count++;
145 namelen += dep->namelen;
146 } else if (isdotdot)
147 parent = INT_GET(dep->inumber, ARCH_CONVERT);
148 /*
149 * Calculate the new size, see if we should give up yet.
150 */
151 size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */
152 count + /* namelen */
153 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
154 namelen + /* name */
155 (i8count ? /* inumber */
156 (uint)sizeof(xfs_dir2_ino8_t) * count :
157 (uint)sizeof(xfs_dir2_ino4_t) * count);
158 if (size > XFS_IFORK_DSIZE(dp))
159 return size; /* size value is a failure */
160 }
161 /*
162 * Create the output header, if it worked.
163 */
164 sfhp->count = count;
165 sfhp->i8count = i8count;
166 XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent);
167 return size;
168}
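/*
 * Worked example of the size formula above: a block directory holding
 * ".", ".." and three plain entries whose names total 15 bytes, with all
 * inode numbers fitting in 32 bits (i8count == 0), prices out as
 *
 *	size = XFS_DIR2_SF_HDR_SIZE(0)			 6  (10 - 4)
 *	     + count					 3  (namelen bytes)
 *	     + count * sizeof(xfs_dir2_sf_off_t)	 6  (2-byte offsets)
 *	     + namelen					15
 *	     + count * sizeof(xfs_dir2_ino4_t)		12
 *	     = 42 bytes
 *
 * and converts to shortform only if 42 <= XFS_IFORK_DSIZE(dp).
 */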
169
170/*
171 * Convert a block format directory to shortform.
172 * Caller has already checked that it will fit, and built us a header.
173 */
174int /* error */
175xfs_dir2_block_to_sf(
176 xfs_da_args_t *args, /* operation arguments */
177 xfs_dabuf_t *bp, /* block buffer */
178 int size, /* shortform directory size */
179 xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
180{
181 xfs_dir2_block_t *block; /* block structure */
182 xfs_dir2_block_tail_t *btp; /* block tail pointer */
183 xfs_dir2_data_entry_t *dep; /* data entry pointer */
184 xfs_inode_t *dp; /* incore directory inode */
185 xfs_dir2_data_unused_t *dup; /* unused data pointer */
186 char *endptr; /* end of data entries */
187 int error; /* error return value */
188 int logflags; /* inode logging flags */
189 xfs_mount_t *mp; /* filesystem mount point */
190 char *ptr; /* current data pointer */
191 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
192 xfs_dir2_sf_t *sfp; /* shortform structure */
193 xfs_ino_t temp;
194
195 xfs_dir2_trace_args_sb("block_to_sf", args, size, bp);
196 dp = args->dp;
197 mp = dp->i_mount;
198
199 /*
200 * Make a copy of the block data, so we can shrink the inode
201 * and add local data.
202 */
203 block = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
204 memcpy(block, bp->data, mp->m_dirblksize);
205 logflags = XFS_ILOG_CORE;
206 if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
207 ASSERT(error != ENOSPC);
208 goto out;
209 }
210 /*
211 * The buffer is now unconditionally gone, whether
212 * xfs_dir2_shrink_inode worked or not.
213 *
214 * Convert the inode to local format.
215 */
216 dp->i_df.if_flags &= ~XFS_IFEXTENTS;
217 dp->i_df.if_flags |= XFS_IFINLINE;
218 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
219 ASSERT(dp->i_df.if_bytes == 0);
220 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
221 logflags |= XFS_ILOG_DDATA;
222 /*
223 * Copy the header into the newly allocated local space.
224 */
225 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
226 memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count));
227 dp->i_d.di_size = size;
228 /*
229 * Set up to loop over the block's entries.
230 */
231 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
232 ptr = (char *)block->u;
233 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
234 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
235 /*
236 * Loop over the active and unused entries.
237 * Stop when we reach the leaf/tail portion of the block.
238 */
239 while (ptr < endptr) {
240 /*
241 * If it's unused, just skip over it.
242 */
243 dup = (xfs_dir2_data_unused_t *)ptr;
244 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
245 ptr += INT_GET(dup->length, ARCH_CONVERT);
246 continue;
247 }
248 dep = (xfs_dir2_data_entry_t *)ptr;
249 /*
250 * Skip .
251 */
252 if (dep->namelen == 1 && dep->name[0] == '.')
253 ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
254 /*
255 * Skip .., but make sure the inode number is right.
256 */
257 else if (dep->namelen == 2 &&
258 dep->name[0] == '.' && dep->name[1] == '.')
259 ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
260 XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
261 /*
262 * Normal entry, copy it into shortform.
263 */
264 else {
265 sfep->namelen = dep->namelen;
266 XFS_DIR2_SF_PUT_OFFSET(sfep,
267 (xfs_dir2_data_aoff_t)
268 ((char *)dep - (char *)block));
269 memcpy(sfep->name, dep->name, dep->namelen);
270 temp=INT_GET(dep->inumber, ARCH_CONVERT);
271 XFS_DIR2_SF_PUT_INUMBER(sfp, &temp,
272 XFS_DIR2_SF_INUMBERP(sfep));
273 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
274 }
275 ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
276 }
277 ASSERT((char *)sfep - (char *)sfp == size);
278 xfs_dir2_sf_check(args);
279out:
280 xfs_trans_log_inode(args->trans, dp, logflags);
281 kmem_free(block, mp->m_dirblksize);
282 return error;
283}
284
285/*
286 * Add a name to a shortform directory.
287 * There are two algorithms, "easy" and "hard", which we decide on
288 * before changing anything.
289 * Convert to block form if necessary, i.e. if the new entry won't fit.
290 */
291int /* error */
292xfs_dir2_sf_addname(
293 xfs_da_args_t *args) /* operation arguments */
294{
295 int add_entsize; /* size of the new entry */
296 xfs_inode_t *dp; /* incore directory inode */
297 int error; /* error return value */
298 int incr_isize; /* total change in size */
299 int new_isize; /* di_size after adding name */
300 int objchange; /* changing to 8-byte inodes */
301 xfs_dir2_data_aoff_t offset; /* offset for new entry */
302 int old_isize; /* di_size before adding name */
303 int pick; /* which algorithm to use */
304 xfs_dir2_sf_t *sfp; /* shortform structure */
305 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
306
307 xfs_dir2_trace_args("sf_addname", args);
308 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
309 dp = args->dp;
310 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
311 /*
312 * Make sure the shortform value has some of its header.
313 */
314 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
315 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
316 return XFS_ERROR(EIO);
317 }
318 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
319 ASSERT(dp->i_df.if_u1.if_data != NULL);
320 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
321 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
322 /*
323 * Compute entry (and change in) size.
324 */
325 add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
326 incr_isize = add_entsize;
327 objchange = 0;
328#if XFS_BIG_INUMS
329 /*
330 * Do we have to change to 8 byte inodes?
331 */
332 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
333 /*
334 * Yes, adjust the entry size and the total size.
335 */
336 add_entsize +=
337 (uint)sizeof(xfs_dir2_ino8_t) -
338 (uint)sizeof(xfs_dir2_ino4_t);
339 incr_isize +=
340 (sfp->hdr.count + 2) *
341 ((uint)sizeof(xfs_dir2_ino8_t) -
342 (uint)sizeof(xfs_dir2_ino4_t));
343 objchange = 1;
344 }
345#endif
346 old_isize = (int)dp->i_d.di_size;
347 new_isize = old_isize + incr_isize;
348 /*
349 * Won't fit as shortform any more (due to size),
350 * or the pick routine says it won't (due to offset values).
351 */
352 if (new_isize > XFS_IFORK_DSIZE(dp) ||
353 (pick =
354 xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
355 /*
356 * Just checking or no space reservation, it doesn't fit.
357 */
358 if (args->justcheck || args->total == 0)
359 return XFS_ERROR(ENOSPC);
360 /*
361 * Convert to block form then add the name.
362 */
363 error = xfs_dir2_sf_to_block(args);
364 if (error)
365 return error;
366 return xfs_dir2_block_addname(args);
367 }
368 /*
369 * Just checking, it fits.
370 */
371 if (args->justcheck)
372 return 0;
373 /*
374 * Do it the easy way - just add it at the end.
375 */
376 if (pick == 1)
377 xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
378 /*
379 * Do it the hard way - look for a place to insert the new entry.
380 * Convert to 8 byte inode numbers first if necessary.
381 */
382 else {
383 ASSERT(pick == 2);
384#if XFS_BIG_INUMS
385 if (objchange)
386 xfs_dir2_sf_toino8(args);
387#endif
388 xfs_dir2_sf_addname_hard(args, objchange, new_isize);
389 }
390 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
391 return 0;
392}
393
394/*
395 * Add the new entry the "easy" way.
396 * This is copying the old directory and adding the new entry at the end.
397 * Since it's sorted by "offset" we need room after the last offset
398 * that's already there, and then room to convert to a block directory.
399 * This is already checked by the pick routine.
400 */
401static void
402xfs_dir2_sf_addname_easy(
403 xfs_da_args_t *args, /* operation arguments */
404 xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */
405 xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
406 int new_isize) /* new directory size */
407{
408 int byteoff; /* byte offset in sf dir */
409 xfs_inode_t *dp; /* incore directory inode */
410 xfs_dir2_sf_t *sfp; /* shortform structure */
411
412 dp = args->dp;
413
414 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
415 byteoff = (int)((char *)sfep - (char *)sfp);
416 /*
417 * Grow the in-inode space.
418 */
419 xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen),
420 XFS_DATA_FORK);
421 /*
422 * Need to set up again due to realloc of the inode data.
423 */
424 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
425 sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
426 /*
427 * Fill in the new entry.
428 */
429 sfep->namelen = args->namelen;
430 XFS_DIR2_SF_PUT_OFFSET(sfep, offset);
431 memcpy(sfep->name, args->name, sfep->namelen);
432 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
433 XFS_DIR2_SF_INUMBERP(sfep));
434 /*
435 * Update the header and inode.
436 */
437 sfp->hdr.count++;
438#if XFS_BIG_INUMS
439 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
440 sfp->hdr.i8count++;
441#endif
442 dp->i_d.di_size = new_isize;
443 xfs_dir2_sf_check(args);
444}
445
446/*
447 * Add the new entry the "hard" way.
448 * The caller has already converted to 8 byte inode numbers if necessary,
449 * in which case we need to leave the i8count at 1.
450 * Find a hole that the new entry will fit into, and copy
451 * the first part of the entries, the new entry, and the last part of
452 * the entries.
453 */
454/* ARGSUSED */
455static void
456xfs_dir2_sf_addname_hard(
457 xfs_da_args_t *args, /* operation arguments */
458 int objchange, /* changing inode number size */
459 int new_isize) /* new directory size */
460{
461 int add_datasize; /* data size need for new ent */
462 char *buf; /* buffer for old */
463 xfs_inode_t *dp; /* incore directory inode */
464 int eof; /* reached end of old dir */
465 int nbytes; /* temp for byte copies */
466 xfs_dir2_data_aoff_t new_offset; /* next offset value */
467 xfs_dir2_data_aoff_t offset; /* current offset value */
468 int old_isize; /* previous di_size */
469 xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
470 xfs_dir2_sf_t *oldsfp; /* original shortform dir */
471 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
472 xfs_dir2_sf_t *sfp; /* new shortform dir */
473
474 /*
475 * Copy the old directory to a temporary (kmem_alloc'd) buffer.
476 */
477 dp = args->dp;
478
479 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
480 old_isize = (int)dp->i_d.di_size;
481 buf = kmem_alloc(old_isize, KM_SLEEP);
482 oldsfp = (xfs_dir2_sf_t *)buf;
483 memcpy(oldsfp, sfp, old_isize);
484 /*
485 * Loop over the old directory finding the place we're going
486 * to insert the new entry.
487 * If it's going to end up at the end then oldsfep will point there.
488 */
489 for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
490 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp),
491 add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen),
492 eof = (char *)oldsfep == &buf[old_isize];
493 !eof;
494 offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen),
495 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep),
496 eof = (char *)oldsfep == &buf[old_isize]) {
497 new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep);
498 if (offset + add_datasize <= new_offset)
499 break;
500 }
501 /*
502 * Get rid of the old directory, then allocate space for
503 * the new one. We do this so xfs_idata_realloc won't copy
504 * the data.
505 */
506 xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
507 xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
508 /*
509 * Reset the pointer since the buffer was reallocated.
510 */
511 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
512 /*
513 * Copy the first part of the directory, including the header.
514 */
515 nbytes = (int)((char *)oldsfep - (char *)oldsfp);
516 memcpy(sfp, oldsfp, nbytes);
517 sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
518 /*
519 * Fill in the new entry, and update the header counts.
520 */
521 sfep->namelen = args->namelen;
522 XFS_DIR2_SF_PUT_OFFSET(sfep, offset);
523 memcpy(sfep->name, args->name, sfep->namelen);
524 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
525 XFS_DIR2_SF_INUMBERP(sfep));
526 sfp->hdr.count++;
527#if XFS_BIG_INUMS
528 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
529 sfp->hdr.i8count++;
530#endif
531 /*
532 * If there's more left to copy, do that.
533 */
534 if (!eof) {
535 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
536 memcpy(sfep, oldsfep, old_isize - nbytes);
537 }
538 kmem_free(buf, old_isize);
539 dp->i_d.di_size = new_isize;
540 xfs_dir2_sf_check(args);
541}
542
543/*
544 * Decide if the new entry will fit at all.
545 * If it will fit, pick between adding the new entry to the end (easy)
546 * or somewhere else (hard).
547 * Return 0 (won't fit), 1 (easy), 2 (hard).
548 */
549/*ARGSUSED*/
550static int /* pick result */
551xfs_dir2_sf_addname_pick(
552 xfs_da_args_t *args, /* operation arguments */
553 int objchange, /* inode # size changes */
554 xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
555 xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
556{
557 xfs_inode_t *dp; /* incore directory inode */
558 int holefit; /* found hole it will fit in */
559 int i; /* entry number */
560 xfs_mount_t *mp; /* filesystem mount point */
561 xfs_dir2_data_aoff_t offset; /* data block offset */
562 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
563 xfs_dir2_sf_t *sfp; /* shortform structure */
564 int size; /* entry's data size */
565 int used; /* data bytes used */
566
567 dp = args->dp;
568 mp = dp->i_mount;
569
570 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
571 size = XFS_DIR2_DATA_ENTSIZE(args->namelen);
572 offset = XFS_DIR2_DATA_FIRST_OFFSET;
573 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
574 holefit = 0;
575 /*
576 * Loop over sf entries.
577 * Keep track of data offset and whether we've seen a place
578 * to insert the new entry.
579 */
580 for (i = 0; i < sfp->hdr.count; i++) {
581 if (!holefit)
582 holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep);
583 offset = XFS_DIR2_SF_GET_OFFSET(sfep) +
584 XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
585 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
586 }
587 /*
588 * Calculate data bytes used excluding the new entry, if this
589 * was a data block (block form directory).
590 */
591 used = offset +
592 (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
593 (uint)sizeof(xfs_dir2_block_tail_t);
594 /*
595 * If it won't fit in block form then we can't insert it;
596 * we'll go back, convert to block, then try the insert and convert
597 * to leaf.
598 */
599 if (used + (holefit ? 0 : size) > mp->m_dirblksize)
600 return 0;
601 /*
602 * If changing the inode number size, do it the hard way.
603 */
604#if XFS_BIG_INUMS
605 if (objchange) {
606 return 2;
607 }
608#else
609 ASSERT(objchange == 0);
610#endif
611 /*
612 * If it won't fit at the end then do it the hard way (use the hole).
613 */
614 if (used + size > mp->m_dirblksize)
615 return 2;
616 /*
617 * Do it the easy way.
618 */
619 *sfepp = sfep;
620 *offsetp = offset;
621 return 1;
622}
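/*
 * Worked example of the "used" computation above, which prices the
 * directory as if it were a single data block (assuming 8-byte leaf
 * entries and an 8-byte block tail): a directory whose entries end at
 * data offset 200 with hdr.count == 10 gives
 *
 *	used = 200 + (10 + 3) * 8 + 8 = 312 bytes
 *
 * where the +3 covers ".", ".." and the entry being added. With a
 * 4096-byte directory block the add is possible; the routine then
 * returns 1 (append at the end) if the entry also fits at the end,
 * else 2 (use an interior hole, or rewrite for 8-byte inodes).
 */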
623
624#ifdef DEBUG
625/*
626 * Check consistency of shortform directory, assert if bad.
627 */
628static void
629xfs_dir2_sf_check(
630 xfs_da_args_t *args) /* operation arguments */
631{
632 xfs_inode_t *dp; /* incore directory inode */
633 int i; /* entry number */
634 int i8count; /* number of big inode#s */
635 xfs_ino_t ino; /* entry inode number */
636 int offset; /* data offset */
637 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
638 xfs_dir2_sf_t *sfp; /* shortform structure */
639
640 dp = args->dp;
641
642 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
643 offset = XFS_DIR2_DATA_FIRST_OFFSET;
644 ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
645 i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
646
647 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
648 i < sfp->hdr.count;
649 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
650 ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset);
651 ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep));
652 i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
653 offset =
654 XFS_DIR2_SF_GET_OFFSET(sfep) +
655 XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
656 }
657 ASSERT(i8count == sfp->hdr.i8count);
658 ASSERT(XFS_BIG_INUMS || i8count == 0);
659 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
660 ASSERT(offset +
661 (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
662 (uint)sizeof(xfs_dir2_block_tail_t) <=
663 dp->i_mount->m_dirblksize);
664}
665#endif /* DEBUG */
666
667/*
668 * Create a new (shortform) directory.
669 */
670int /* error, always 0 */
671xfs_dir2_sf_create(
672 xfs_da_args_t *args, /* operation arguments */
673 xfs_ino_t pino) /* parent inode number */
674{
675 xfs_inode_t *dp; /* incore directory inode */
676 int i8count; /* parent inode is an 8-byte number */
677 xfs_dir2_sf_t *sfp; /* shortform structure */
678 int size; /* directory size */
679
680 xfs_dir2_trace_args_i("sf_create", args, pino);
681 dp = args->dp;
682
683 ASSERT(dp != NULL);
684 ASSERT(dp->i_d.di_size == 0);
685 /*
686 * If it's currently a zero-length extent file,
687 * convert it to local format.
688 */
689 if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
690 dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
691 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
692 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
693 dp->i_df.if_flags |= XFS_IFINLINE;
694 }
695 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
696 ASSERT(dp->i_df.if_bytes == 0);
697 i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
698 size = XFS_DIR2_SF_HDR_SIZE(i8count);
699 /*
700 * Make a buffer for the data.
701 */
702 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
703 /*
704 * Fill in the header.
705 */
706 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
707 sfp->hdr.i8count = i8count;
708 /*
709 * Now we can put in the inode number, since i8count is set.
710 */
711 XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent);
712 sfp->hdr.count = 0;
713 dp->i_d.di_size = size;
714 xfs_dir2_sf_check(args);
715 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
716 return 0;
717}
718
719int /* error */
720xfs_dir2_sf_getdents(
721 xfs_inode_t *dp, /* incore directory inode */
722 uio_t *uio, /* caller's buffer control */
723 int *eofp, /* eof reached? (out) */
724 xfs_dirent_t *dbp, /* caller's buffer */
725 xfs_dir2_put_t put) /* abi's formatting function */
726{
727 int error; /* error return value */
728 int i; /* shortform entry number */
729 xfs_mount_t *mp; /* filesystem mount point */
730 xfs_dir2_dataptr_t off; /* current entry's offset */
731 xfs_dir2_put_args_t p; /* arg package for put rtn */
732 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
733 xfs_dir2_sf_t *sfp; /* shortform structure */
734 xfs_off_t dir_offset;
735
736 mp = dp->i_mount;
737
738 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
739 /*
740 * Give up if the directory is way too short.
741 */
742 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
743 ASSERT(XFS_FORCED_SHUTDOWN(mp));
744 return XFS_ERROR(EIO);
745 }
746
747 dir_offset = uio->uio_offset;
748
749 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
750 ASSERT(dp->i_df.if_u1.if_data != NULL);
751
752 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
753
754 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
755
756 /*
757 * If the block number in the offset is out of range, we're done.
758 */
759 if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) {
760 *eofp = 1;
761 return 0;
762 }
763
764 /*
765 * Set up putargs structure.
766 */
767 p.dbp = dbp;
768 p.put = put;
769 p.uio = uio;
770 /*
771 * Put . entry unless we're starting past it.
772 */
773 if (dir_offset <=
774 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
775 XFS_DIR2_DATA_DOT_OFFSET)) {
776 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0,
777 XFS_DIR2_DATA_DOTDOT_OFFSET);
778 p.ino = dp->i_ino;
779#if XFS_BIG_INUMS
780 p.ino += mp->m_inoadd;
781#endif
782 p.name = ".";
783 p.namelen = 1;
784
785 error = p.put(&p);
786
787 if (!p.done) {
788 uio->uio_offset =
789 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
790 XFS_DIR2_DATA_DOT_OFFSET);
791 return error;
792 }
793 }
794
795 /*
796 * Put .. entry unless we're starting past it.
797 */
798 if (dir_offset <=
799 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
800 XFS_DIR2_DATA_DOTDOT_OFFSET)) {
801 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
802 XFS_DIR2_DATA_FIRST_OFFSET);
803 p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
804#if XFS_BIG_INUMS
805 p.ino += mp->m_inoadd;
806#endif
807 p.name = "..";
808 p.namelen = 2;
809
810 error = p.put(&p);
811
812 if (!p.done) {
813 uio->uio_offset =
814 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
815 XFS_DIR2_DATA_DOTDOT_OFFSET);
816 return error;
817 }
818 }
819
820 /*
821 * Loop while there are more entries and put'ing works.
822 */
823 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
824 i < sfp->hdr.count;
825 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
826
827 off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
828 XFS_DIR2_SF_GET_OFFSET(sfep));
829
830 if (dir_offset > off)
831 continue;
832
833 p.namelen = sfep->namelen;
834
835 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
836 XFS_DIR2_SF_GET_OFFSET(sfep) +
837 XFS_DIR2_DATA_ENTSIZE(p.namelen));
838
839 p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep));
840#if XFS_BIG_INUMS
841 p.ino += mp->m_inoadd;
842#endif
843 p.name = (char *)sfep->name;
844
845 error = p.put(&p);
846
847 if (!p.done) {
848 uio->uio_offset = off;
849 return error;
850 }
851 }
852
853 /*
854 * They all fit.
855 */
856 *eofp = 1;
857
858 uio->uio_offset =
859 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
860
861 return 0;
862}
863
864/*
865 * Lookup an entry in a shortform directory.
866 * Returns EEXIST if found, ENOENT if not found.
867 */
868int /* error */
869xfs_dir2_sf_lookup(
870 xfs_da_args_t *args) /* operation arguments */
871{
872 xfs_inode_t *dp; /* incore directory inode */
873 int i; /* entry index */
874 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
875 xfs_dir2_sf_t *sfp; /* shortform structure */
876
877 xfs_dir2_trace_args("sf_lookup", args);
878 xfs_dir2_sf_check(args);
879 dp = args->dp;
880
881 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
882 /*
883 * Bail out if the directory is way too short.
884 */
885 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
886 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
887 return XFS_ERROR(EIO);
888 }
889 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
890 ASSERT(dp->i_df.if_u1.if_data != NULL);
891 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
892 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
893 /*
894 * Special case for .
895 */
896 if (args->namelen == 1 && args->name[0] == '.') {
897 args->inumber = dp->i_ino;
898 return XFS_ERROR(EEXIST);
899 }
900 /*
901 * Special case for ..
902 */
903 if (args->namelen == 2 &&
904 args->name[0] == '.' && args->name[1] == '.') {
905 args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
906 return XFS_ERROR(EEXIST);
907 }
908 /*
909 * Loop over all the entries trying to match ours.
910 */
911 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
912 i < sfp->hdr.count;
913 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
914 if (sfep->namelen == args->namelen &&
915 sfep->name[0] == args->name[0] &&
916 memcmp(args->name, sfep->name, args->namelen) == 0) {
917 args->inumber =
918 XFS_DIR2_SF_GET_INUMBER(sfp,
919 XFS_DIR2_SF_INUMBERP(sfep));
920 return XFS_ERROR(EEXIST);
921 }
922 }
923 /*
924 * Didn't find it.
925 */
926 ASSERT(args->oknoent);
927 return XFS_ERROR(ENOENT);
928}
929
930/*
931 * Remove an entry from a shortform directory.
932 */
933int /* error */
934xfs_dir2_sf_removename(
935 xfs_da_args_t *args)
936{
937 int byteoff; /* offset of removed entry */
938 xfs_inode_t *dp; /* incore directory inode */
939 int entsize; /* this entry's size */
940 int i; /* shortform entry index */
941 int newsize; /* new inode size */
942 int oldsize; /* old inode size */
943 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
944 xfs_dir2_sf_t *sfp; /* shortform structure */
945
946 xfs_dir2_trace_args("sf_removename", args);
947 dp = args->dp;
948
949 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
950 oldsize = (int)dp->i_d.di_size;
951 /*
952 * Bail out if the directory is way too short.
953 */
954 if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
955 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
956 return XFS_ERROR(EIO);
957 }
958 ASSERT(dp->i_df.if_bytes == oldsize);
959 ASSERT(dp->i_df.if_u1.if_data != NULL);
960 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
961 ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
962 /*
963 * Loop over the old directory entries.
964 * Find the one we're deleting.
965 */
966 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
967 i < sfp->hdr.count;
968 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
969 if (sfep->namelen == args->namelen &&
970 sfep->name[0] == args->name[0] &&
971 memcmp(sfep->name, args->name, args->namelen) == 0) {
972 ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp,
973 XFS_DIR2_SF_INUMBERP(sfep)) ==
974 args->inumber);
975 break;
976 }
977 }
978 /*
979 * Didn't find it.
980 */
981 if (i == sfp->hdr.count) {
982 return XFS_ERROR(ENOENT);
983 }
984 /*
985 * Calculate sizes.
986 */
987 byteoff = (int)((char *)sfep - (char *)sfp);
988 entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
989 newsize = oldsize - entsize;
990 /*
991 * Copy the part, if any, after the removed entry, sliding it down.
992 */
993 if (byteoff + entsize < oldsize)
994 memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
995 oldsize - (byteoff + entsize));
996 /*
997 * Fix up the header and file size.
998 */
999 sfp->hdr.count--;
1000 dp->i_d.di_size = newsize;
1001 /*
1002 * Reallocate, making it smaller.
1003 */
1004 xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
1005 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1006#if XFS_BIG_INUMS
1007 /*
1008 * Are we changing inode number size?
1009 */
1010 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
1011 if (sfp->hdr.i8count == 1)
1012 xfs_dir2_sf_toino4(args);
1013 else
1014 sfp->hdr.i8count--;
1015 }
1016#endif
1017 xfs_dir2_sf_check(args);
1018 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1019 return 0;
1020}
1021
1022/*
1023 * Replace the inode number of an entry in a shortform directory.
1024 */
1025int /* error */
1026xfs_dir2_sf_replace(
1027 xfs_da_args_t *args) /* operation arguments */
1028{
1029 xfs_inode_t *dp; /* incore directory inode */
1030 int i; /* entry index */
1031#if XFS_BIG_INUMS || defined(DEBUG)
1032 xfs_ino_t ino=0; /* entry old inode number */
1033#endif
1034#if XFS_BIG_INUMS
1035 int i8elevated; /* sf_toino8 set i8count=1 */
1036#endif
1037 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
1038 xfs_dir2_sf_t *sfp; /* shortform structure */
1039
1040 xfs_dir2_trace_args("sf_replace", args);
1041 dp = args->dp;
1042
1043 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
1044 /*
1045 * Bail out if the shortform directory is way too small.
1046 */
1047 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
1048 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
1049 return XFS_ERROR(EIO);
1050 }
1051 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
1052 ASSERT(dp->i_df.if_u1.if_data != NULL);
1053 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1054 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
1055#if XFS_BIG_INUMS
1056 /*
1057 * The new inode number is large, and we need to convert to 8-byte inodes.
1058 */
1059 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
1060 int error; /* error return value */
1061 int newsize; /* new inode size */
1062
1063 newsize =
1064 dp->i_df.if_bytes +
1065 (sfp->hdr.count + 1) *
1066 ((uint)sizeof(xfs_dir2_ino8_t) -
1067 (uint)sizeof(xfs_dir2_ino4_t));
1068 /*
1069 * Won't fit as shortform, convert to block then do replace.
1070 */
1071 if (newsize > XFS_IFORK_DSIZE(dp)) {
1072 error = xfs_dir2_sf_to_block(args);
1073 if (error) {
1074 return error;
1075 }
1076 return xfs_dir2_block_replace(args);
1077 }
1078 /*
1079 * Still fits, convert to 8-byte now.
1080 */
1081 xfs_dir2_sf_toino8(args);
1082 i8elevated = 1;
1083 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1084 } else
1085 i8elevated = 0;
1086#endif
1087 ASSERT(args->namelen != 1 || args->name[0] != '.');
1088 /*
1089 * Replace ..'s entry.
1090 */
1091 if (args->namelen == 2 &&
1092 args->name[0] == '.' && args->name[1] == '.') {
1093#if XFS_BIG_INUMS || defined(DEBUG)
1094 ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
1095 ASSERT(args->inumber != ino);
1096#endif
1097 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent);
1098 }
1099 /*
1100 * Normal entry, look for the name.
1101 */
1102 else {
1103 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
1104 i < sfp->hdr.count;
1105 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
1106 if (sfep->namelen == args->namelen &&
1107 sfep->name[0] == args->name[0] &&
1108 memcmp(args->name, sfep->name, args->namelen) == 0) {
1109#if XFS_BIG_INUMS || defined(DEBUG)
1110 ino = XFS_DIR2_SF_GET_INUMBER(sfp,
1111 XFS_DIR2_SF_INUMBERP(sfep));
1112 ASSERT(args->inumber != ino);
1113#endif
1114 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
1115 XFS_DIR2_SF_INUMBERP(sfep));
1116 break;
1117 }
1118 }
1119 /*
1120 * Didn't find it.
1121 */
1122 if (i == sfp->hdr.count) {
1123 ASSERT(args->oknoent);
1124#if XFS_BIG_INUMS
1125 if (i8elevated)
1126 xfs_dir2_sf_toino4(args);
1127#endif
1128 return XFS_ERROR(ENOENT);
1129 }
1130 }
1131#if XFS_BIG_INUMS
1132 /*
1133 * See if the old number was large, the new number is small.
1134 */
1135 if (ino > XFS_DIR2_MAX_SHORT_INUM &&
1136 args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
1137 /*
1138 * If the old count was one, we need to convert back to small (4-byte) form.
1139 */
1140 if (sfp->hdr.i8count == 1)
1141 xfs_dir2_sf_toino4(args);
1142 else
1143 sfp->hdr.i8count--;
1144 }
1145 /*
1146 * See if the old number was small, the new number is large.
1147 */
1148 if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
1149 args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
1150 /*
1151 * add to the i8count unless we just converted to 8-byte
1152 * inodes (which does an implied i8count = 1)
1153 */
1154 ASSERT(sfp->hdr.i8count != 0);
1155 if (!i8elevated)
1156 sfp->hdr.i8count++;
1157 }
1158#endif
1159 xfs_dir2_sf_check(args);
1160 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
1161 return 0;
1162}
1163
1164#if XFS_BIG_INUMS
1165/*
1166 * Convert from 8-byte inode numbers to 4-byte inode numbers.
1167 * The last 8-byte inode number is gone, but the count is still 1.
1168 */
1169static void
1170xfs_dir2_sf_toino4(
1171 xfs_da_args_t *args) /* operation arguments */
1172{
1173 char *buf; /* old dir's buffer */
1174 xfs_inode_t *dp; /* incore directory inode */
1175 int i; /* entry index */
1176 xfs_ino_t ino; /* entry inode number */
1177 int newsize; /* new inode size */
1178 xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
1179 xfs_dir2_sf_t *oldsfp; /* old sf directory */
1180 int oldsize; /* old inode size */
1181 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1182 xfs_dir2_sf_t *sfp; /* new sf directory */
1183
1184 xfs_dir2_trace_args("sf_toino4", args);
1185 dp = args->dp;
1186
1187 /*
1188 * Copy the old directory to the buffer.
1189 * Then nuke it from the inode, and add the new buffer to the inode.
1190 * Don't want xfs_idata_realloc copying the data here.
1191 */
1192 oldsize = dp->i_df.if_bytes;
1193 buf = kmem_alloc(oldsize, KM_SLEEP);
1194 oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1195 ASSERT(oldsfp->hdr.i8count == 1);
1196 memcpy(buf, oldsfp, oldsize);
1197 /*
1198 * Compute the new inode size.
1199 */
1200 newsize =
1201 oldsize -
1202 (oldsfp->hdr.count + 1) *
1203 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1204 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1205 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1206 /*
1207 * Reset our pointers, the data has moved.
1208 */
1209 oldsfp = (xfs_dir2_sf_t *)buf;
1210 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1211 /*
1212 * Fill in the new header.
1213 */
1214 sfp->hdr.count = oldsfp->hdr.count;
1215 sfp->hdr.i8count = 0;
1216 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent);
1217 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent);
1218 /*
1219 * Copy the entries field by field.
1220 */
1221 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
1222 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
1223 i < sfp->hdr.count;
1224 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
1225 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
1226 sfep->namelen = oldsfep->namelen;
1227 sfep->offset = oldsfep->offset;
1228 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1229 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp,
1230 XFS_DIR2_SF_INUMBERP(oldsfep));
1231 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep));
1232 }
1233 /*
1234 * Clean up the inode.
1235 */
1236 kmem_free(buf, oldsize);
1237 dp->i_d.di_size = newsize;
1238 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1239}
1240
1241/*
1242 * Convert from 4-byte inode numbers to 8-byte inode numbers.
1243 * The new 8-byte inode number is not there yet; we leave with the
1244 * count at 1, but no corresponding entry.
1245 */
1246static void
1247xfs_dir2_sf_toino8(
1248 xfs_da_args_t *args) /* operation arguments */
1249{
1250 char *buf; /* old dir's buffer */
1251 xfs_inode_t *dp; /* incore directory inode */
1252 int i; /* entry index */
1253 xfs_ino_t ino; /* entry inode number */
1254 int newsize; /* new inode size */
1255 xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
1256 xfs_dir2_sf_t *oldsfp; /* old sf directory */
1257 int oldsize; /* old inode size */
1258 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1259 xfs_dir2_sf_t *sfp; /* new sf directory */
1260
1261 xfs_dir2_trace_args("sf_toino8", args);
1262 dp = args->dp;
1263
1264 /*
1265 * Copy the old directory to the buffer.
1266 * Then nuke it from the inode, and add the new buffer to the inode.
1267 * Don't want xfs_idata_realloc copying the data here.
1268 */
1269 oldsize = dp->i_df.if_bytes;
1270 buf = kmem_alloc(oldsize, KM_SLEEP);
1271 oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1272 ASSERT(oldsfp->hdr.i8count == 0);
1273 memcpy(buf, oldsfp, oldsize);
1274 /*
1275 * Compute the new inode size.
1276 */
1277 newsize =
1278 oldsize +
1279 (oldsfp->hdr.count + 1) *
1280 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1281 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1282 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1283 /*
1284 * Reset our pointers, the data has moved.
1285 */
1286 oldsfp = (xfs_dir2_sf_t *)buf;
1287 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1288 /*
1289 * Fill in the new header.
1290 */
1291 sfp->hdr.count = oldsfp->hdr.count;
1292 sfp->hdr.i8count = 1;
1293 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent);
1294 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent);
1295 /*
1296 * Copy the entries field by field.
1297 */
1298 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
1299 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
1300 i < sfp->hdr.count;
1301 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
1302 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
1303 sfep->namelen = oldsfep->namelen;
1304 sfep->offset = oldsfep->offset;
1305 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1306 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp,
1307 XFS_DIR2_SF_INUMBERP(oldsfep));
1308 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep));
1309 }
1310 /*
1311 * Clean up the inode.
1312 */
1313 kmem_free(buf, oldsize);
1314 dp->i_d.di_size = newsize;
1315 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1316}
1317#endif /* XFS_BIG_INUMS */
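/*
 * Worked example for the two conversions above: both resize the inline
 * data by (count + 1) inode numbers (the +1 being the parent in the
 * header), each changing by sizeof(xfs_dir2_ino8_t) -
 * sizeof(xfs_dir2_ino4_t) = 4 bytes. Converting a 5-entry directory to
 * 8-byte inodes therefore grows it by (5 + 1) * 4 = 24 bytes; converting
 * back shrinks it by the same 24.
 */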
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
new file mode 100644
index 000000000000..bac6f5a2a312
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -0,0 +1,243 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_SF_H__
33#define __XFS_DIR2_SF_H__
34
35/*
36 * Directory layout when stored internal to an inode.
37 *
38 * Small directories are packed as tightly as possible so as to
39 * fit into the literal area of the inode.
40 */
41
42struct uio;
43struct xfs_dabuf;
44struct xfs_da_args;
45struct xfs_dir2_block;
46struct xfs_inode;
47struct xfs_mount;
48struct xfs_trans;
49
50/*
51 * Maximum size of a shortform directory.
52 */
53#define XFS_DIR2_SF_MAX_SIZE \
54 (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
55 (uint)sizeof(xfs_agino_t))
56
57/*
58 * Inode number stored as 8 8-bit values.
59 */
60typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
61
62/*
63 * Inode number stored as 4 8-bit values.
64 * Works a lot of the time, when all the inode numbers in a directory
65 * fit in 32 bits.
66 */
67typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
68
69typedef union {
70 xfs_dir2_ino8_t i8;
71 xfs_dir2_ino4_t i4;
72} xfs_dir2_inou_t;
73#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
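/*
 * A minimal sketch (hypothetical helper, assuming the usual xfs type
 * headers are in scope): an inode number needs the 8-byte form exactly
 * when it exceeds XFS_DIR2_MAX_SHORT_INUM, i.e. does not fit in 32 bits.
 * This is the predicate behind all the i8count bookkeeping in
 * xfs_dir2_sf.c.
 */
static inline int example_ino_needs_i8(xfs_ino_t ino)
{
	return ino > XFS_DIR2_MAX_SHORT_INUM;	/* > 0xffffffff => 8 bytes */
}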
74
75/*
76 * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
77 * Only 16 bits are needed; this is the byte offset into the single-block form.
78 */
79typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t;
80
81/*
82 * The parent directory has a dedicated field, and the self-pointer must
83 * be calculated on the fly.
84 *
85 * Entries are packed toward the top as tightly as possible. The header
86 * and the elements must be memcpy'd out into a work area to get correct
87 * alignment for the inode number fields.
88 */
89typedef struct xfs_dir2_sf_hdr {
90 __uint8_t count; /* count of entries */
91 __uint8_t i8count; /* count of 8-byte inode #s */
92 xfs_dir2_inou_t parent; /* parent dir inode number */
93} xfs_dir2_sf_hdr_t;
94
95typedef struct xfs_dir2_sf_entry {
96 __uint8_t namelen; /* actual name length */
97 xfs_dir2_sf_off_t offset; /* saved offset */
98 __uint8_t name[1]; /* name, variable size */
99 xfs_dir2_inou_t inumber; /* inode number, var. offset */
100} xfs_dir2_sf_entry_t;
101
102typedef struct xfs_dir2_sf {
103 xfs_dir2_sf_hdr_t hdr; /* shortform header */
104 xfs_dir2_sf_entry_t list[1]; /* shortform entries */
105} xfs_dir2_sf_t;
106
107#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_HDR_SIZE)
108int xfs_dir2_sf_hdr_size(int i8count);
109#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count)
110#else
111#define XFS_DIR2_SF_HDR_SIZE(i8count) \
112 ((uint)sizeof(xfs_dir2_sf_hdr_t) - \
113 ((i8count) == 0) * \
114 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
115#endif
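/*
 * Worked example: the header is count (1) + i8count (1) + parent
 * (xfs_dir2_inou_t, 8) = 10 bytes, but with i8count == 0 the parent is
 * stored as a 4-byte xfs_dir2_ino4_t, so
 *
 *	XFS_DIR2_SF_HDR_SIZE(0) = 10 - (8 - 4) = 6
 *	XFS_DIR2_SF_HDR_SIZE(1) = 10
 */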
116
117#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_INUMBERP)
118xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep);
119#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep)
120#else
121#define XFS_DIR2_SF_INUMBERP(sfep) \
122 ((xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen])
123#endif
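/*
 * Layout example: the name is variable length and the inode number is
 * packed immediately after it, so the inumber has no fixed offset; the
 * macro above finds it at &name[namelen]. For namelen == 3:
 *
 *	byte  0		namelen (3)
 *	bytes 1-2	offset
 *	bytes 3-5	name
 *	bytes 6..	inumber (4 or 8 bytes, per hdr.i8count)
 */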
124
125#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_INUMBER)
126xfs_intino_t xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from);
127#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \
128 xfs_dir2_sf_get_inumber(sfp, from)
129
130#else
131#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \
132 ((sfp)->hdr.i8count == 0 ? \
133 (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \
134 (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8))
135#endif
136
137#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
138void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
139 xfs_dir2_inou_t *to);
140#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \
141 xfs_dir2_sf_put_inumber(sfp,from,to)
142#else
143#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \
144 if ((sfp)->hdr.i8count == 0) { \
145 XFS_PUT_DIR_INO4(*(from), (to)->i4); \
146 } else { \
147 XFS_PUT_DIR_INO8(*(from), (to)->i8); \
148 }
149#endif
150
151#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_OFFSET)
152xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep);
153#define XFS_DIR2_SF_GET_OFFSET(sfep) \
154 xfs_dir2_sf_get_offset(sfep)
155#else
156#define XFS_DIR2_SF_GET_OFFSET(sfep) \
157 INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i)
158#endif
159
160#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
161void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep,
162 xfs_dir2_data_aoff_t off);
163#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \
164 xfs_dir2_sf_put_offset(sfep,off)
165#else
166#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \
167 INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i,off)
168#endif
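
Both offset accessors come down to byte-at-a-time big-endian loads and
stores, so the packed 2-byte field never needs natural alignment. A plain-C
sketch of what the INT_*_UNALIGNED_16_BE macros amount to (get16_be and
put16_be are illustrative names, not the kernel macros):

#include <stdint.h>

/* Read a 16-bit big-endian value one byte at a time; p may be unaligned. */
static uint16_t get16_be(const uint8_t *p)
{
        return (uint16_t)((p[0] << 8) | p[1]);
}

/* Write a 16-bit big-endian value one byte at a time. */
static void put16_be(uint8_t *p, uint16_t v)
{
        p[0] = (uint8_t)(v >> 8);
        p[1] = (uint8_t)(v & 0xff);
}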
169
170#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
171int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len);
172#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \
173 xfs_dir2_sf_entsize_byname(sfp,len)
174#else
175#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) /* space a name uses */ \
176 ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \
177 ((sfp)->hdr.i8count == 0) * \
178 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
179#endif
180
181#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
182int xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep);
183#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \
184 xfs_dir2_sf_entsize_byentry(sfp,sfep)
185#else
186#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) /* space an entry uses */ \
187 ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \
188 ((sfp)->hdr.i8count == 0) * \
189 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
190#endif
191
192#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
193xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp);
194#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp)
195#else
196#define XFS_DIR2_SF_FIRSTENTRY(sfp) /* first entry in struct */ \
197 ((xfs_dir2_sf_entry_t *) \
198 ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)))
199#endif
200
201#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_NEXTENTRY)
202xfs_dir2_sf_entry_t *xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp,
203 xfs_dir2_sf_entry_t *sfep);
204#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep)
205#else
206#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) /* next entry in struct */ \
207 ((xfs_dir2_sf_entry_t *) \
208 ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)))
209#endif
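
Taken together, the macros above describe a packed stream of variable-length
records: the header, then entries laid end to end, each sized by its name
length and by whether the directory stores 4- or 8-byte inode numbers. The
standalone model below mirrors the ENTSIZE_BYNAME/NEXTENTRY arithmetic
(sf_entsize, sf_next and the simplified structs are hypothetical, not the
kernel types):

#include <stdint.h>

/* Simplified shortform records; all fields byte-sized, so no padding. */
struct sf_hdr   { uint8_t count, i8count, parent[8]; };
struct sf_entry { uint8_t namelen, offset[2], name[1]; /* ino after name */ };

/*
 * Space an entry uses: the fixed bytes, the real name (minus the name[1]
 * placeholder), an 8-byte inode number, less 4 bytes when every inode
 * number in the directory fits in 32 bits (i8count == 0).
 */
static int sf_entsize(const struct sf_hdr *hdr, int namelen)
{
        return (int)sizeof(struct sf_entry) - 1 + namelen + 8
                - (hdr->i8count == 0 ? 8 - 4 : 0);
}

/* Step to the next packed entry, as XFS_DIR2_SF_NEXTENTRY does. */
static struct sf_entry *sf_next(const struct sf_hdr *hdr, struct sf_entry *ep)
{
        return (struct sf_entry *)((char *)ep + sf_entsize(hdr, ep->namelen));
}

Because the inode number sits at a variable offset after the name, callers
must go through the accessors; nothing past `offset` is naturally aligned.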
210
211/*
212 * Functions.
213 */
214
215extern int
216 xfs_dir2_block_sfsize(struct xfs_inode *dp,
217 struct xfs_dir2_block *block,
218 xfs_dir2_sf_hdr_t *sfhp);
219
220extern int
221 xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp,
222 int size, xfs_dir2_sf_hdr_t *sfhp);
223
224extern int
225 xfs_dir2_sf_addname(struct xfs_da_args *args);
226
227extern int
228 xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
229
230extern int
231 xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
232 struct xfs_dirent *dbp, xfs_dir2_put_t put);
233
234extern int
235 xfs_dir2_sf_lookup(struct xfs_da_args *args);
236
237extern int
238 xfs_dir2_sf_removename(struct xfs_da_args *args);
239
240extern int
241 xfs_dir2_sf_replace(struct xfs_da_args *args);
242
243#endif /* __XFS_DIR2_SF_H__ */
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
new file mode 100644
index 000000000000..9d6417393140
--- /dev/null
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -0,0 +1,235 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_trace.c
35 * Tracing for xfs v2 directories.
36 */
37#include "xfs.h"
38
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_bmap_btree.h"
44#include "xfs_attr_sf.h"
45#include "xfs_dir_sf.h"
46#include "xfs_dir2_sf.h"
47#include "xfs_dinode.h"
48#include "xfs_inode.h"
49#include "xfs_da_btree.h"
50#include "xfs_dir2_trace.h"
51
52#ifdef XFS_DIR2_TRACE
53ktrace_t *xfs_dir2_trace_buf;
54
55/*
56 * Enter something in the trace buffers.
57 */
58static void
59xfs_dir2_trace_enter(
60 xfs_inode_t *dp,
61 int type,
62 char *where,
63 char *name,
64 int namelen,
65 void *a0,
66 void *a1,
67 void *a2,
68 void *a3,
69 void *a4,
70 void *a5,
71 void *a6,
72 void *a7)
73{
74 void *n[5];
75
76 ASSERT(xfs_dir2_trace_buf);
77 ASSERT(dp->i_dir_trace);
78 if (name)
79 memcpy(n, name, min((int)sizeof(n), namelen));
80 else
81 memset((char *)n, 0, sizeof(n));
82 ktrace_enter(xfs_dir2_trace_buf,
83 (void *)(long)type, (void *)where,
84 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
85 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
86 (void *)(long)namelen,
87 (void *)n[0], (void *)n[1], (void *)n[2],
88 (void *)n[3], (void *)n[4]);
89 ktrace_enter(dp->i_dir_trace,
90 (void *)(long)type, (void *)where,
91 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
92 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
93 (void *)(long)namelen,
94 (void *)n[0], (void *)n[1], (void *)n[2],
95 (void *)n[3], (void *)n[4]);
96}
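
Each ktrace record is fixed-size, so xfs_dir2_trace_enter() captures at most
sizeof(n) bytes of the entry name into five pointer-sized slots, 20 bytes on
a 32-bit build and 40 on a 64-bit one; longer names are simply truncated in
the trace. The capture pattern in isolation (capture_name is a hypothetical
name for this sketch):

#include <string.h>

/* Copy at most `cap` bytes of a possibly long, possibly NULL name into a
 * fixed-size scratch area, zero-filling when there is no name at all. */
static void capture_name(void *scratch, size_t cap, const char *name,
                         size_t len)
{
        if (name)
                memcpy(scratch, name, len < cap ? len : cap);
        else
                memset(scratch, 0, cap);
}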
97
98void
99xfs_dir2_trace_args(
100 char *where,
101 xfs_da_args_t *args)
102{
103 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
104 (char *)args->name, (int)args->namelen,
105 (void *)(unsigned long)args->hashval,
106 (void *)((unsigned long)(args->inumber >> 32)),
107 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
108 (void *)args->dp, (void *)args->trans,
109 (void *)(unsigned long)args->justcheck, NULL, NULL);
110}
111
112void
113xfs_dir2_trace_args_b(
114 char *where,
115 xfs_da_args_t *args,
116 xfs_dabuf_t *bp)
117{
118 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
119 (char *)args->name, (int)args->namelen,
120 (void *)(unsigned long)args->hashval,
121 (void *)((unsigned long)(args->inumber >> 32)),
122 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
123 (void *)args->dp, (void *)args->trans,
124 (void *)(unsigned long)args->justcheck,
125 (void *)(bp ? bp->bps[0] : NULL), NULL);
126}
127
128void
129xfs_dir2_trace_args_bb(
130 char *where,
131 xfs_da_args_t *args,
132 xfs_dabuf_t *lbp,
133 xfs_dabuf_t *dbp)
134{
135 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
136 (char *)args->name, (int)args->namelen,
137 (void *)(unsigned long)args->hashval,
138 (void *)((unsigned long)(args->inumber >> 32)),
139 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
140 (void *)args->dp, (void *)args->trans,
141 (void *)(unsigned long)args->justcheck,
142 (void *)(lbp ? lbp->bps[0] : NULL),
143 (void *)(dbp ? dbp->bps[0] : NULL));
144}
145
146void
147xfs_dir2_trace_args_bibii(
148 char *where,
149 xfs_da_args_t *args,
150 xfs_dabuf_t *bs,
151 int ss,
152 xfs_dabuf_t *bd,
153 int sd,
154 int c)
155{
156 xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL;
157 xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL;
158
159 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
160 (char *)args->name, (int)args->namelen,
161 (void *)args->dp, (void *)args->trans,
162 (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd,
163 (void *)(long)c, NULL);
164}
165
166void
167xfs_dir2_trace_args_db(
168 char *where,
169 xfs_da_args_t *args,
170 xfs_dir2_db_t db,
171 xfs_dabuf_t *bp)
172{
173 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
174
175 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
176 (char *)args->name, (int)args->namelen,
177 (void *)(unsigned long)args->hashval,
178 (void *)((unsigned long)(args->inumber >> 32)),
179 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
180 (void *)args->dp, (void *)args->trans,
181 (void *)(unsigned long)args->justcheck, (void *)(long)db,
182 (void *)dbp);
183}
184
185void
186xfs_dir2_trace_args_i(
187 char *where,
188 xfs_da_args_t *args,
189 xfs_ino_t i)
190{
191 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
192 (char *)args->name, (int)args->namelen,
193 (void *)(unsigned long)args->hashval,
194 (void *)((unsigned long)(args->inumber >> 32)),
195 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
196 (void *)args->dp, (void *)args->trans,
197 (void *)(unsigned long)args->justcheck,
198 (void *)((unsigned long)(i >> 32)),
199 (void *)((unsigned long)(i & 0xFFFFFFFF)));
200}
201
202void
203xfs_dir2_trace_args_s(
204 char *where,
205 xfs_da_args_t *args,
206 int s)
207{
208 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
209 (char *)args->name, (int)args->namelen,
210 (void *)(unsigned long)args->hashval,
211 (void *)((unsigned long)(args->inumber >> 32)),
212 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
213 (void *)args->dp, (void *)args->trans,
214 (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL);
215}
216
217void
218xfs_dir2_trace_args_sb(
219 char *where,
220 xfs_da_args_t *args,
221 int s,
222 xfs_dabuf_t *bp)
223{
224 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
225
226 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
227 (char *)args->name, (int)args->namelen,
228 (void *)(unsigned long)args->hashval,
229 (void *)((unsigned long)(args->inumber >> 32)),
230 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
231 (void *)args->dp, (void *)args->trans,
232 (void *)(unsigned long)args->justcheck, (void *)(long)s,
233 (void *)dbp);
234}
235#endif /* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h
new file mode 100644
index 000000000000..0a178bffa806
--- /dev/null
+++ b/fs/xfs/xfs_dir2_trace.h
@@ -0,0 +1,86 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_TRACE_H__
33#define __XFS_DIR2_TRACE_H__
34
35/*
36 * Tracing for xfs v2 directories.
37 */
38
39#if defined(XFS_DIR2_TRACE)
40
41struct ktrace;
42struct xfs_dabuf;
43struct xfs_da_args;
44
45#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */
46#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */
47extern struct ktrace *xfs_dir2_trace_buf;
48
49#define XFS_DIR2_KTRACE_ARGS 1 /* args only */
50#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */
51#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */
52#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */
53#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */
54#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */
55#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */
56#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */
57
58void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
59void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
60 struct xfs_dabuf *bp);
61void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
62 struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
63void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
64 struct xfs_dabuf *bs, int ss,
65 struct xfs_dabuf *bd, int sd, int c);
66void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
67 xfs_dir2_db_t db, struct xfs_dabuf *bp);
68void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
69void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
70void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
71 struct xfs_dabuf *bp);
72
73#else /* XFS_DIR2_TRACE */
74
75#define xfs_dir2_trace_args(where, args)
76#define xfs_dir2_trace_args_b(where, args, bp)
77#define xfs_dir2_trace_args_bb(where, args, lbp, dbp)
78#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
79#define xfs_dir2_trace_args_db(where, args, db, bp)
80#define xfs_dir2_trace_args_i(where, args, i)
81#define xfs_dir2_trace_args_s(where, args, s)
82#define xfs_dir2_trace_args_sb(where, args, s, bp)
83
84#endif /* XFS_DIR2_TRACE */
85
86#endif /* __XFS_DIR2_TRACE_H__ */
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
new file mode 100644
index 000000000000..617018d6bbdc
--- /dev/null
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -0,0 +1,2231 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir_leaf.c
35 *
36 * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_ialloc_btree.h"
54#include "xfs_alloc.h"
55#include "xfs_btree.h"
56#include "xfs_attr_sf.h"
57#include "xfs_dir_sf.h"
58#include "xfs_dir2_sf.h"
59#include "xfs_dinode.h"
60#include "xfs_inode_item.h"
61#include "xfs_inode.h"
62#include "xfs_bmap.h"
63#include "xfs_da_btree.h"
64#include "xfs_dir_leaf.h"
65#include "xfs_error.h"
66
67/*
68 * xfs_dir_leaf.c
69 *
70 * Routines to implement leaf blocks of directories as Btrees of hashed names.
71 */
72
73/*========================================================================
74 * Function prototypes for the kernel.
75 *========================================================================*/
76
77/*
78 * Routines used for growing the Btree.
79 */
80STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
81 int insertion_index,
82 int freemap_index);
83STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
84 int musthave, int justcheck);
85STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
86 xfs_da_state_blk_t *blk1,
87 xfs_da_state_blk_t *blk2);
88STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
89 xfs_da_state_blk_t *leaf_blk_1,
90 xfs_da_state_blk_t *leaf_blk_2,
91 int *number_entries_in_blk1,
92 int *number_namebytes_in_blk1);
93
94/*
95 * Utility routines.
96 */
97STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
98 int src_start,
99 xfs_dir_leafblock_t *dst_leaf,
100 int dst_start, int move_count,
101 xfs_mount_t *mp);
102
103
104/*========================================================================
105 * External routines when dirsize < XFS_IFORK_DSIZE(dp).
106 *========================================================================*/
107
108
109/*
110 * Validate a given inode number.
111 */
112int
113xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
114{
115 xfs_agblock_t agblkno;
116 xfs_agino_t agino;
117 xfs_agnumber_t agno;
118 int ino_ok;
119 int ioff;
120
121 agno = XFS_INO_TO_AGNO(mp, ino);
122 agblkno = XFS_INO_TO_AGBNO(mp, ino);
123 ioff = XFS_INO_TO_OFFSET(mp, ino);
124 agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
125 ino_ok =
126 agno < mp->m_sb.sb_agcount &&
127 agblkno < mp->m_sb.sb_agblocks &&
128 agblkno != 0 &&
129 ioff < (1 << mp->m_sb.sb_inopblog) &&
130 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
131 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
132 XFS_RANDOM_DIR_INO_VALIDATE))) {
133 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
134 (unsigned long long) ino);
135 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
136 return XFS_ERROR(EFSCORRUPTED);
137 }
138 return 0;
139}
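
xfs_dir_ino_validate() decomposes the 64-bit inode number into AG number, AG
block, and offset within the block, then range-checks each piece against the
filesystem geometry, recomposing via the AG macros as a final consistency
check. A freestanding sketch of the meaningful range checks, with the bit
widths taken as parameters (in the kernel they come from the superblock,
e.g. sb_inopblog):

#include <stdint.h>

/* ino layout, low bits to high: offset-in-block, block-in-AG, AG number. */
static int ino_ok(uint64_t ino, unsigned inopblog, unsigned agblklog,
                  uint32_t agcount, uint32_t agblocks)
{
        uint64_t agbno = (ino >> inopblog) & ((1ull << agblklog) - 1);
        uint64_t agno  = ino >> (inopblog + agblklog);

        return agno < agcount &&        /* the AG exists */
               agbno != 0 &&            /* AG block 0 holds headers, no inodes */
               agbno < agblocks;        /* the block lies inside the AG */
}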
140
141/*
142 * Create the initial contents of a shortform directory.
143 */
144int
145xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
146{
147 xfs_dir_sf_hdr_t *hdr;
148 xfs_inode_t *dp;
149
150 dp = args->dp;
151 ASSERT(dp != NULL);
152 ASSERT(dp->i_d.di_size == 0);
153 if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
154 dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
155 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
156 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
157 dp->i_df.if_flags |= XFS_IFINLINE;
158 }
159 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
160 ASSERT(dp->i_df.if_bytes == 0);
161 xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
162 hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
163 XFS_DIR_SF_PUT_DIRINO(&parent, &hdr->parent);
164
165 hdr->count = 0;
166 dp->i_d.di_size = sizeof(*hdr);
167 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
168 return(0);
169}
170
171/*
172 * Add a name to the shortform directory structure.
173 * The caller has already checked that the new entry fits in the inode.
174 */
175int
176xfs_dir_shortform_addname(xfs_da_args_t *args)
177{
178 xfs_dir_shortform_t *sf;
179 xfs_dir_sf_entry_t *sfe;
180 int i, offset, size;
181 xfs_inode_t *dp;
182
183 dp = args->dp;
184 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
185 /*
186 * Catch the case where the conversion from shortform to leaf
187 * failed part way through.
188 */
189 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
190 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
191 return XFS_ERROR(EIO);
192 }
193 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
194 ASSERT(dp->i_df.if_u1.if_data != NULL);
195 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
196 sfe = &sf->list[0];
197 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
198 if (sfe->namelen == args->namelen &&
199 args->name[0] == sfe->name[0] &&
200 memcmp(args->name, sfe->name, args->namelen) == 0)
201 return(XFS_ERROR(EEXIST));
202 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
203 }
204
205 offset = (int)((char *)sfe - (char *)sf);
206 size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
207 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
208 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
209 sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
210
211 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
212 sfe->namelen = args->namelen;
213 memcpy(sfe->name, args->name, sfe->namelen);
214 INT_MOD(sf->hdr.count, ARCH_CONVERT, +1);
215
216 dp->i_d.di_size += size;
217 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
218
219 return(0);
220}
221
222/*
223 * Remove a name from the shortform directory structure.
224 */
225int
226xfs_dir_shortform_removename(xfs_da_args_t *args)
227{
228 xfs_dir_shortform_t *sf;
229 xfs_dir_sf_entry_t *sfe;
230 int base, size = 0, i;
231 xfs_inode_t *dp;
232
233 dp = args->dp;
234 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
235 /*
236 * Catch the case where the conversion from shortform to leaf
237 * failed part way through.
238 */
239 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
240 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
241 return XFS_ERROR(EIO);
242 }
243 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
244 ASSERT(dp->i_df.if_u1.if_data != NULL);
245 base = sizeof(xfs_dir_sf_hdr_t);
246 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
247 sfe = &sf->list[0];
248 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
249 size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
250 if (sfe->namelen == args->namelen &&
251 sfe->name[0] == args->name[0] &&
252 memcmp(sfe->name, args->name, args->namelen) == 0)
253 break;
254 base += size;
255 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
256 }
257 if (i < 0) {
258 ASSERT(args->oknoent);
259 return(XFS_ERROR(ENOENT));
260 }
261
262 if ((base + size) != dp->i_d.di_size) {
263 memmove(&((char *)sf)[base], &((char *)sf)[base+size],
264 dp->i_d.di_size - (base+size));
265 }
266 INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
267
268 xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
269 dp->i_d.di_size -= size;
270 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
271
272 return(0);
273}
274
275/*
276 * Look up a name in a shortform directory structure.
277 */
278int
279xfs_dir_shortform_lookup(xfs_da_args_t *args)
280{
281 xfs_dir_shortform_t *sf;
282 xfs_dir_sf_entry_t *sfe;
283 int i;
284 xfs_inode_t *dp;
285
286 dp = args->dp;
287 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
288 /*
289 * Catch the case where the conversion from shortform to leaf
290 * failed part way through.
291 */
292 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
293 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
294 return XFS_ERROR(EIO);
295 }
296 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
297 ASSERT(dp->i_df.if_u1.if_data != NULL);
298 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
299 if (args->namelen == 2 &&
300 args->name[0] == '.' && args->name[1] == '.') {
301 XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &args->inumber);
302 return(XFS_ERROR(EEXIST));
303 }
304 if (args->namelen == 1 && args->name[0] == '.') {
305 args->inumber = dp->i_ino;
306 return(XFS_ERROR(EEXIST));
307 }
308 sfe = &sf->list[0];
309 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
310 if (sfe->namelen == args->namelen &&
311 sfe->name[0] == args->name[0] &&
312 memcmp(args->name, sfe->name, args->namelen) == 0) {
313 XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args->inumber);
314 return(XFS_ERROR(EEXIST));
315 }
316 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
317 }
318 ASSERT(args->oknoent);
319 return(XFS_ERROR(ENOENT));
320}
321
322/*
323 * Convert from using the shortform to the leaf.
324 */
325int
326xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
327{
328 xfs_inode_t *dp;
329 xfs_dir_shortform_t *sf;
330 xfs_dir_sf_entry_t *sfe;
331 xfs_da_args_t args;
332 xfs_ino_t inumber;
333 char *tmpbuffer;
334 int retval, i, size;
335 xfs_dablk_t blkno;
336 xfs_dabuf_t *bp;
337
338 dp = iargs->dp;
339 /*
340 * Catch the case where the conversion from shortform to leaf
341 * failed part way through.
342 */
343 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
344 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
345 return XFS_ERROR(EIO);
346 }
347 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
348 ASSERT(dp->i_df.if_u1.if_data != NULL);
349 size = dp->i_df.if_bytes;
350 tmpbuffer = kmem_alloc(size, KM_SLEEP);
351 ASSERT(tmpbuffer != NULL);
352
353 memcpy(tmpbuffer, dp->i_df.if_u1.if_data, size);
354
355 sf = (xfs_dir_shortform_t *)tmpbuffer;
356 XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &inumber);
357
358 xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
359 dp->i_d.di_size = 0;
360 xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
361 retval = xfs_da_grow_inode(iargs, &blkno);
362 if (retval)
363 goto out;
364
365 ASSERT(blkno == 0);
366 retval = xfs_dir_leaf_create(iargs, blkno, &bp);
367 if (retval)
368 goto out;
369 xfs_da_buf_done(bp);
370
371 args.name = ".";
372 args.namelen = 1;
373 args.hashval = xfs_dir_hash_dot;
374 args.inumber = dp->i_ino;
375 args.dp = dp;
376 args.firstblock = iargs->firstblock;
377 args.flist = iargs->flist;
378 args.total = iargs->total;
379 args.whichfork = XFS_DATA_FORK;
380 args.trans = iargs->trans;
381 args.justcheck = 0;
382 args.addname = args.oknoent = 1;
383 retval = xfs_dir_leaf_addname(&args);
384 if (retval)
385 goto out;
386
387 args.name = "..";
388 args.namelen = 2;
389 args.hashval = xfs_dir_hash_dotdot;
390 args.inumber = inumber;
391 retval = xfs_dir_leaf_addname(&args);
392 if (retval)
393 goto out;
394
395 sfe = &sf->list[0];
396 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
397 args.name = (char *)(sfe->name);
398 args.namelen = sfe->namelen;
399 args.hashval = xfs_da_hashname((char *)(sfe->name),
400 sfe->namelen);
401 XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args.inumber);
402 retval = xfs_dir_leaf_addname(&args);
403 if (retval)
404 goto out;
405 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
406 }
407 retval = 0;
408
409out:
410 kmem_free(tmpbuffer, size);
411 return(retval);
412}
413
414STATIC int
415xfs_dir_shortform_compare(const void *a, const void *b)
416{
417 xfs_dir_sf_sort_t *sa, *sb;
418
419 sa = (xfs_dir_sf_sort_t *)a;
420 sb = (xfs_dir_sf_sort_t *)b;
421 if (sa->hash < sb->hash)
422 return -1;
423 else if (sa->hash > sb->hash)
424 return 1;
425 else
426 return sa->entno - sb->entno;
427}
428
429/*
430 * Copy out directory entries for getdents(), for shortform directories.
431 */
432/*ARGSUSED*/
433int
434xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
435 xfs_dirent_t *dbp, xfs_dir_put_t put)
436{
437 xfs_dir_shortform_t *sf;
438 xfs_dir_sf_entry_t *sfe;
439 int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
440 xfs_mount_t *mp;
441 xfs_dahash_t cookhash, hash;
442 xfs_dir_put_args_t p;
443 xfs_dir_sf_sort_t *sbuf, *sbp;
444
445 mp = dp->i_mount;
446 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
447 cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
448 want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
449 nsbuf = INT_GET(sf->hdr.count, ARCH_CONVERT) + 2;
450 sbsize = (nsbuf + 1) * sizeof(*sbuf);
451 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
452
453 xfs_dir_trace_g_du("sf: start", dp, uio);
454
455 /*
456 * Collect all the entries into the buffer.
457 * Entry 0 is .
458 */
459 sbp->entno = 0;
460 sbp->seqno = 0;
461 sbp->hash = xfs_dir_hash_dot;
462 sbp->ino = dp->i_ino;
463 sbp->name = ".";
464 sbp->namelen = 1;
465 sbp++;
466
467 /*
468 * Entry 1 is ..
469 */
470 sbp->entno = 1;
471 sbp->seqno = 0;
472 sbp->hash = xfs_dir_hash_dotdot;
473 sbp->ino = XFS_GET_DIR_INO8(sf->hdr.parent);
474 sbp->name = "..";
475 sbp->namelen = 2;
476 sbp++;
477
478 /*
479 * Scan the directory data for the rest of the entries.
480 */
481 for (i = 0, sfe = &sf->list[0];
482 i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
483
484 if (unlikely(
485 ((char *)sfe < (char *)sf) ||
486 ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)))) {
487 xfs_dir_trace_g_du("sf: corrupted", dp, uio);
488 XFS_CORRUPTION_ERROR("xfs_dir_shortform_getdents",
489 XFS_ERRLEVEL_LOW, mp, sfe);
490 kmem_free(sbuf, sbsize);
491 return XFS_ERROR(EFSCORRUPTED);
492 }
493
494 sbp->entno = i + 2;
495 sbp->seqno = 0;
496 sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
497 sbp->ino = XFS_GET_DIR_INO8(sfe->inumber);
498 sbp->name = (char *)sfe->name;
499 sbp->namelen = sfe->namelen;
500 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
501 sbp++;
502 }
503
504 /*
505 * Sort the entries on hash then entno.
506 */
507 qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
508 /*
509 * Stuff in last entry.
510 */
511 sbp->entno = nsbuf;
512 sbp->hash = XFS_DA_MAXHASH;
513 sbp->seqno = 0;
514 /*
515 * Figure out the sequence numbers in case there's a hash duplicate.
516 */
517 for (hash = sbuf->hash, sbp = sbuf + 1;
518 sbp < &sbuf[nsbuf + 1]; sbp++) {
519 if (sbp->hash == hash)
520 sbp->seqno = sbp[-1].seqno + 1;
521 else
522 hash = sbp->hash;
523 }
524
525 /*
526 * Set up put routine.
527 */
528 p.dbp = dbp;
529 p.put = put;
530 p.uio = uio;
531
532 /*
533 * Find our place.
534 */
535 for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
536 if (sbp->hash > cookhash ||
537 (sbp->hash == cookhash && sbp->seqno >= want_entno))
538 break;
539 }
540
541 /*
542 * Did we fail to find anything? We stop at the last entry,
543 * the one we put maxhash into.
544 */
545 if (sbp == &sbuf[nsbuf]) {
546 kmem_free(sbuf, sbsize);
547 xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
548 uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
549 *eofp = 1;
550 return 0;
551 }
552
553 /*
554 * Loop putting entries into the user buffer.
555 */
556 while (sbp < &sbuf[nsbuf]) {
557 /*
558 * Save the first resid in a run of equal-hashval entries
559 * so that we can back them out if they don't all fit.
560 */
561 if (sbp->seqno == 0 || sbp == sbuf)
562 lastresid = uio->uio_resid;
563 XFS_PUT_COOKIE(p.cook, mp, 0, sbp[1].seqno, sbp[1].hash);
564 p.ino = sbp->ino;
565#if XFS_BIG_INUMS
566 p.ino += mp->m_inoadd;
567#endif
568 p.name = sbp->name;
569 p.namelen = sbp->namelen;
570 retval = p.put(&p);
571 if (!p.done) {
572 uio->uio_offset =
573 XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
574 kmem_free(sbuf, sbsize);
575 uio->uio_resid = lastresid;
576 xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
577 return retval;
578 }
579 sbp++;
580 }
581 kmem_free(sbuf, sbsize);
582 uio->uio_offset = p.cook.o;
583 *eofp = 1;
584 xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
585 return 0;
586}
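
A getdents cookie here is effectively (hash, seqno): the hash gives the sort
order, and the seqno pass above numbers runs of equal hash values so that a
later resume lands on the right duplicate rather than replaying or skipping
entries. The numbering rule in isolation (hypothetical struct; assumes
element 0 starts with seqno 0, as the code above arranges):

#include <stdint.h>

struct ent { uint32_t hash; int seqno; };

/* After sorting by hash, number duplicates 0, 1, 2, ... within each run. */
static void assign_seqnos(struct ent *e, int n)
{
        int i;

        for (i = 1; i < n; i++)
                e[i].seqno = (e[i].hash == e[i - 1].hash)
                        ? e[i - 1].seqno + 1 : 0;
}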
587
588/*
589 * Look up a name in a shortform directory structure, replace the inode number.
590 */
591int
592xfs_dir_shortform_replace(xfs_da_args_t *args)
593{
594 xfs_dir_shortform_t *sf;
595 xfs_dir_sf_entry_t *sfe;
596 xfs_inode_t *dp;
597 int i;
598
599 dp = args->dp;
600 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
601 /*
602 * Catch the case where the conversion from shortform to leaf
603 * failed part way through.
604 */
605 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
606 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
607 return XFS_ERROR(EIO);
608 }
609 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
610 ASSERT(dp->i_df.if_u1.if_data != NULL);
611 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
612 if (args->namelen == 2 &&
613 args->name[0] == '.' && args->name[1] == '.') {
614 /* XXX - replace assert? */
615 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent);
616 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
617 return(0);
618 }
619 ASSERT(args->namelen != 1 || args->name[0] != '.');
620 sfe = &sf->list[0];
621 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
622 if (sfe->namelen == args->namelen &&
623 sfe->name[0] == args->name[0] &&
624 memcmp(args->name, sfe->name, args->namelen) == 0) {
625 ASSERT(memcmp((char *)&args->inumber,
626 (char *)&sfe->inumber, sizeof(xfs_ino_t)));
627 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
628 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
629 return(0);
630 }
631 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
632 }
633 ASSERT(args->oknoent);
634 return(XFS_ERROR(ENOENT));
635}
636
637/*
638 * Convert a leaf directory to shortform structure
639 */
640int
641xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
642{
643 xfs_dir_leafblock_t *leaf;
644 xfs_dir_leaf_hdr_t *hdr;
645 xfs_dir_leaf_entry_t *entry;
646 xfs_dir_leaf_name_t *namest;
647 xfs_da_args_t args;
648 xfs_inode_t *dp;
649 xfs_ino_t parent;
650 char *tmpbuffer;
651 int retval, i;
652 xfs_dabuf_t *bp;
653
654 dp = iargs->dp;
655 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
656 ASSERT(tmpbuffer != NULL);
657
658 retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
659 XFS_DATA_FORK);
660 if (retval)
661 goto out;
662 ASSERT(bp != NULL);
663 memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
664 leaf = (xfs_dir_leafblock_t *)tmpbuffer;
665 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
666 memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
667
668 /*
669 * Find and special case the parent inode number
670 */
671 hdr = &leaf->hdr;
672 entry = &leaf->entries[0];
673 for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
674 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
675 if ((entry->namelen == 2) &&
676 (namest->name[0] == '.') &&
677 (namest->name[1] == '.')) {
678 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &parent);
679 entry->nameidx = 0;
680 } else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
681 entry->nameidx = 0;
682 }
683 }
684 retval = xfs_da_shrink_inode(iargs, 0, bp);
685 if (retval)
686 goto out;
687 retval = xfs_dir_shortform_create(iargs, parent);
688 if (retval)
689 goto out;
690
691 /*
692 * Copy the rest of the filenames
693 */
694 entry = &leaf->entries[0];
695 args.dp = dp;
696 args.firstblock = iargs->firstblock;
697 args.flist = iargs->flist;
698 args.total = iargs->total;
699 args.whichfork = XFS_DATA_FORK;
700 args.trans = iargs->trans;
701 args.justcheck = 0;
702 args.addname = args.oknoent = 1;
703 for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
704 if (!entry->nameidx)
705 continue;
706 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
707 args.name = (char *)(namest->name);
708 args.namelen = entry->namelen;
709 args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
710 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber);
711 xfs_dir_shortform_addname(&args);
712 }
713
714out:
715 kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
716 return(retval);
717}
718
719/*
720 * Convert from using a single leaf to a root node and a leaf.
721 */
722int
723xfs_dir_leaf_to_node(xfs_da_args_t *args)
724{
725 xfs_dir_leafblock_t *leaf;
726 xfs_da_intnode_t *node;
727 xfs_inode_t *dp;
728 xfs_dabuf_t *bp1, *bp2;
729 xfs_dablk_t blkno;
730 int retval;
731
732 dp = args->dp;
733 retval = xfs_da_grow_inode(args, &blkno);
734 ASSERT(blkno == 1);
735 if (retval)
736 return(retval);
737 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
738 XFS_DATA_FORK);
739 if (retval)
740 return(retval);
741 ASSERT(bp1 != NULL);
742 retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
743 XFS_DATA_FORK);
744 if (retval) {
745 xfs_da_buf_done(bp1);
746 return(retval);
747 }
748 ASSERT(bp2 != NULL);
749 memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
750 xfs_da_buf_done(bp1);
751 xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
752
753 /*
754 * Set up the new root node.
755 */
756 retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
757 if (retval) {
758 xfs_da_buf_done(bp2);
759 return(retval);
760 }
761 node = bp1->data;
762 leaf = bp2->data;
763 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
764 INT_SET(node->btree[0].hashval, ARCH_CONVERT, INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
765 xfs_da_buf_done(bp2);
766 INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
767 INT_SET(node->hdr.count, ARCH_CONVERT, 1);
768 xfs_da_log_buf(args->trans, bp1,
769 XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
770 xfs_da_buf_done(bp1);
771
772 return(retval);
773}
774
775
776/*========================================================================
777 * Routines used for growing the Btree.
778 *========================================================================*/
779
780/*
781 * Create the initial contents of a leaf directory
782 * or a leaf in a node directory.
783 */
784int
785xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
786{
787 xfs_dir_leafblock_t *leaf;
788 xfs_dir_leaf_hdr_t *hdr;
789 xfs_inode_t *dp;
790 xfs_dabuf_t *bp;
791 int retval;
792
793 dp = args->dp;
794 ASSERT(dp != NULL);
795 retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
796 if (retval)
797 return(retval);
798 ASSERT(bp != NULL);
799 leaf = bp->data;
800 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
801 hdr = &leaf->hdr;
802 INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_DIR_LEAF_MAGIC);
803 INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
804 if (!hdr->firstused)
805 INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
806 INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
807 INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
808
809 xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
810
811 *bpp = bp;
812 return(0);
813}
814
815/*
816 * Split the leaf node, rebalance, then add the new entry.
817 */
818int
819xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
820 xfs_da_state_blk_t *newblk)
821{
822 xfs_dablk_t blkno;
823 xfs_da_args_t *args;
824 int error;
825
826 /*
827 * Allocate space for a new leaf node.
828 */
829 args = state->args;
830 ASSERT(args != NULL);
831 ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
832 error = xfs_da_grow_inode(args, &blkno);
833 if (error)
834 return(error);
835 error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
836 if (error)
837 return(error);
838 newblk->blkno = blkno;
839 newblk->magic = XFS_DIR_LEAF_MAGIC;
840
841 /*
842 * Rebalance the entries across the two leaves.
843 */
844 xfs_dir_leaf_rebalance(state, oldblk, newblk);
845 error = xfs_da_blk_link(state, oldblk, newblk);
846 if (error)
847 return(error);
848
849 /*
850 * Insert the new entry in the correct block.
851 */
852 if (state->inleaf) {
853 error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
854 } else {
855 error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
856 }
857
858 /*
859 * Update last hashval in each block since we added the name.
860 */
861 oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
862 newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
863 return(error);
864}
865
866/*
867 * Add a name to the leaf directory structure.
868 *
869 * Must take into account fragmented leaves and leaves where the freemap has
870 * lost some freespace information (i.e., holes).
871 */
872int
873xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
874{
875 xfs_dir_leafblock_t *leaf;
876 xfs_dir_leaf_hdr_t *hdr;
877 xfs_dir_leaf_map_t *map;
878 int tablesize, entsize, sum, i, tmp, error;
879
880 leaf = bp->data;
881 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
882 ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
883 hdr = &leaf->hdr;
884 entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
885
886 /*
887 * Search through freemap for first-fit on new name length.
888 * (may need to figure in size of entry struct too)
889 */
890 tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
891 + (uint)sizeof(xfs_dir_leaf_hdr_t);
892 map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
893 for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
894 if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
895 sum += INT_GET(map->size, ARCH_CONVERT);
896 continue;
897 }
898 if (!map->size)
899 continue; /* no space in this map */
900 tmp = entsize;
901 if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
902 tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
903 if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
904 if (!args->justcheck)
905 xfs_dir_leaf_add_work(bp, args, index, i);
906 return(0);
907 }
908 sum += INT_GET(map->size, ARCH_CONVERT);
909 }
910
911 /*
912 * If there are no holes in the address space of the block,
913 * and we don't have enough freespace, then compaction will do us
914 * no good and we should just give up.
915 */
916 if (!hdr->holes && (sum < entsize))
917 return(XFS_ERROR(ENOSPC));
918
919 /*
920 * Compact the entries to coalesce free space.
921 * Pass the justcheck flag so the checking pass can return
922 * an error, without changing anything, if it won't fit.
923 */
924 error = xfs_dir_leaf_compact(args->trans, bp,
925 args->total == 0 ?
926 entsize +
927 (uint)sizeof(xfs_dir_leaf_entry_t) : 0,
928 args->justcheck);
929 if (error)
930 return(error);
931 /*
932 * After compaction, the block is guaranteed to have only one
933 * free region, in freemap[0]. If it is not big enough, give up.
934 */
935 if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
936 (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
937 return(XFS_ERROR(ENOSPC));
938
939 if (!args->justcheck)
940 xfs_dir_leaf_add_work(bp, args, index, 0);
941 return(0);
942}
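
So the add path is a first-fit search over the small leaf freemap, with one
compact-and-retry fallback when the map is fragmented or stale (hdr.holes).
Stripped of the table-growth and logging details, the search step reduces to
the sketch below (leaf_first_fit and MAPSIZE stand in for the kernel's
freemap walk and XFS_DIR_LEAF_MAPSIZE; the kernel scans from the high index
down, as above):

struct freemap { int base, size; };

#define MAPSIZE 3       /* stands in for XFS_DIR_LEAF_MAPSIZE */

/* Return the index of a freemap slot that fits `need` bytes, or -1 so the
 * caller can compact the block (coalescing all free space) and retry. */
static int leaf_first_fit(const struct freemap *map, int need)
{
        int i;

        for (i = MAPSIZE - 1; i >= 0; i--)
                if (map[i].size >= need)
                        return i;
        return -1;
}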
943
944/*
945 * Add a name to a leaf directory structure.
946 */
947STATIC void
948xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
949 int mapindex)
950{
951 xfs_dir_leafblock_t *leaf;
952 xfs_dir_leaf_hdr_t *hdr;
953 xfs_dir_leaf_entry_t *entry;
954 xfs_dir_leaf_name_t *namest;
955 xfs_dir_leaf_map_t *map;
956 /* REFERENCED */
957 xfs_mount_t *mp;
958 int tmp, i;
959
960 leaf = bp->data;
961 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
962 hdr = &leaf->hdr;
963 ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
964 ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
965
966 /*
967 * Force open some space in the entry array and fill it in.
968 */
969 entry = &leaf->entries[index];
970 if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
971 tmp = INT_GET(hdr->count, ARCH_CONVERT) - index;
972 tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
973 memmove(entry + 1, entry, tmp);
974 xfs_da_log_buf(args->trans, bp,
975 XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
976 }
977 INT_MOD(hdr->count, ARCH_CONVERT, +1);
978
979 /*
980 * Allocate space for the new string (at the end of the run).
981 */
982 map = &hdr->freemap[mapindex];
983 mp = args->trans->t_mountp;
984 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
985 ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
986 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
987 INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
988 INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
989 INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
990 entry->namelen = args->namelen;
991 xfs_da_log_buf(args->trans, bp,
992 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
993
994 /*
995 * Copy the string and inode number into the new space.
996 */
997 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
998 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber);
999 memcpy(namest->name, args->name, args->namelen);
1000 xfs_da_log_buf(args->trans, bp,
1001 XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
1002
1003 /*
1004 * Update the control info for this leaf node
1005 */
1006 if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
1007 INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
1008 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
1009 tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
1010 + (uint)sizeof(xfs_dir_leaf_hdr_t);
1011 map = &hdr->freemap[0];
1012 for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
1013 if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
1014 INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
1015 INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
1016 }
1017 }
1018 INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
1019 xfs_da_log_buf(args->trans, bp,
1020 XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1021}
1022
1023/*
1024 * Garbage collect a leaf directory block by copying it to a new buffer.
1025 */
1026STATIC int
1027xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
1028 int justcheck)
1029{
1030 xfs_dir_leafblock_t *leaf_s, *leaf_d;
1031 xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
1032 xfs_mount_t *mp;
1033 char *tmpbuffer;
1034 char *tmpbuffer2=NULL;
1035 int rval;
1036 int lbsize;
1037
1038 mp = trans->t_mountp;
1039 lbsize = XFS_LBSIZE(mp);
1040 tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
1041 ASSERT(tmpbuffer != NULL);
1042 memcpy(tmpbuffer, bp->data, lbsize);
1043
1044 /*
1045 * Make a second copy in case xfs_dir_leaf_moveents()
1046 * below destroys the original.
1047 */
1048 if (musthave || justcheck) {
1049 tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
1050 memcpy(tmpbuffer2, bp->data, lbsize);
1051 }
1052 memset(bp->data, 0, lbsize);
1053
1054 /*
1055 * Copy basic information
1056 */
1057 leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
1058 leaf_d = bp->data;
1059 hdr_s = &leaf_s->hdr;
1060 hdr_d = &leaf_d->hdr;
1061 hdr_d->info = hdr_s->info; /* struct copy */
1062 INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
1063 if (!hdr_d->firstused)
1064 INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
1065 hdr_d->namebytes = 0;
1066 hdr_d->count = 0;
1067 hdr_d->holes = 0;
1068 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
1069 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
1070
1071 /*
1072 * Copy all entry's in the same (sorted) order,
1073 * but allocate filenames packed and in sequence.
1074 * This changes the source (leaf_s) as well.
1075 */
1076 xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
1077
1078 if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
1079 rval = XFS_ERROR(ENOSPC);
1080 else
1081 rval = 0;
1082
1083 if (justcheck || rval == ENOSPC) {
1084 ASSERT(tmpbuffer2);
1085 memcpy(bp->data, tmpbuffer2, lbsize);
1086 } else {
1087 xfs_da_log_buf(trans, bp, 0, lbsize - 1);
1088 }
1089
1090 kmem_free(tmpbuffer, lbsize);
1091 if (musthave || justcheck)
1092 kmem_free(tmpbuffer2, lbsize);
1093 return(rval);
1094}
1095
1096/*
1097 * Redistribute the directory entries between two leaf nodes,
1098 * taking into account the size of the new entry.
1099 *
1100 * NOTE: if new block is empty, then it will get the upper half of old block.
1101 */
1102STATIC void
1103xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1104 xfs_da_state_blk_t *blk2)
1105{
1106 xfs_da_state_blk_t *tmp_blk;
1107 xfs_dir_leafblock_t *leaf1, *leaf2;
1108 xfs_dir_leaf_hdr_t *hdr1, *hdr2;
1109 int count, totallen, max, space, swap;
1110
1111 /*
1112 * Set up environment.
1113 */
1114 ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
1115 ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
1116 leaf1 = blk1->bp->data;
1117 leaf2 = blk2->bp->data;
1118 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1119 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1120
1121 /*
1122 * Check ordering of blocks, reverse if it makes things simpler.
1123 */
1124 swap = 0;
1125 if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
1126 tmp_blk = blk1;
1127 blk1 = blk2;
1128 blk2 = tmp_blk;
1129 leaf1 = blk1->bp->data;
1130 leaf2 = blk2->bp->data;
1131 swap = 1;
1132 }
1133 hdr1 = &leaf1->hdr;
1134 hdr2 = &leaf2->hdr;
1135
1136 /*
1137 * Examine entries until we reduce the absolute difference in
1138 * byte usage between the two blocks to a minimum. Then get
1139 * the direction to copy and the number of elements to move.
1140 */
1141 state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
1142 &count, &totallen);
1143 if (swap)
1144 state->inleaf = !state->inleaf;
1145
1146 /*
1147 * Move any entries required from leaf to leaf:
1148 */
1149 if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
1150 /*
1151 * Figure the total bytes to be added to the destination leaf.
1152 */
1153 count = INT_GET(hdr1->count, ARCH_CONVERT) - count; /* number entries being moved */
1154 space = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
1155 space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
1156 space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
1157
1158 /*
1159 * leaf2 is the destination, compact it if it looks tight.
1160 */
1161 max = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
1162 max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
1163 if (space > max) {
1164 xfs_dir_leaf_compact(state->args->trans, blk2->bp,
1165 0, 0);
1166 }
1167
1168 /*
1169 * Move high entries from leaf1 to low end of leaf2.
1170 */
1171 xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
1172 leaf2, 0, count, state->mp);
1173
1174 xfs_da_log_buf(state->args->trans, blk1->bp, 0,
1175 state->blocksize-1);
1176 xfs_da_log_buf(state->args->trans, blk2->bp, 0,
1177 state->blocksize-1);
1178
1179 } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
1180 /*
1181 * Figure the total bytes to be added to the destination leaf.
1182 */
1183 count -= INT_GET(hdr1->count, ARCH_CONVERT); /* number entries being moved */
1184 space = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
1185 space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
1186 space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
1187
1188 /*
1189 * leaf1 is the destination, compact it if it looks tight.
1190 */
1191 max = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
1192 max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
1193 if (space > max) {
1194 xfs_dir_leaf_compact(state->args->trans, blk1->bp,
1195 0, 0);
1196 }
1197
1198 /*
1199 * Move low entries from leaf2 to high end of leaf1.
1200 */
1201 xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
1202 count, state->mp);
1203
1204 xfs_da_log_buf(state->args->trans, blk1->bp, 0,
1205 state->blocksize-1);
1206 xfs_da_log_buf(state->args->trans, blk2->bp, 0,
1207 state->blocksize-1);
1208 }
1209
1210 /*
1211 * Copy out last hashval in each block for B-tree code.
1212 */
1213 blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1214 blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1215
1216 /*
1217 * Adjust the expected index for insertion.
1218 * GROT: this doesn't work unless blk2 was originally empty.
1219 */
1220 if (!state->inleaf) {
1221 blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
1222 }
1223}
1224
1225/*
1226 * Examine entries until we reduce the absolute difference in
1227 * byte usage between the two blocks to a minimum.
1228 * GROT: Is this really necessary? With other than a 512 byte blocksize,
1229 * GROT: there will always be enough room in either block for a new entry.
1230 * GROT: Do a double-split for this case?
1231 */
1232STATIC int
1233xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
1234 xfs_da_state_blk_t *blk1,
1235 xfs_da_state_blk_t *blk2,
1236 int *countarg, int *namebytesarg)
1237{
1238 xfs_dir_leafblock_t *leaf1, *leaf2;
1239 xfs_dir_leaf_hdr_t *hdr1, *hdr2;
1240 xfs_dir_leaf_entry_t *entry;
1241 int count, max, totallen, half;
1242 int lastdelta, foundit, tmp;
1243
1244 /*
1245 * Set up environment.
1246 */
1247 leaf1 = blk1->bp->data;
1248 leaf2 = blk2->bp->data;
1249 hdr1 = &leaf1->hdr;
1250 hdr2 = &leaf2->hdr;
1251 foundit = 0;
1252 totallen = 0;
1253
1254 /*
1255 * Examine entries until we reduce the absolute difference in
1256 * byte usage between the two blocks to a minimum.
1257 */
1258 max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
1259 half = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
1260 half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
1261 half /= 2;
1262 lastdelta = state->blocksize;
1263 entry = &leaf1->entries[0];
1264 for (count = 0; count < max; entry++, count++) {
1265
1266#define XFS_DIR_ABS(A) (((A) < 0) ? -(A) : (A))
1267 /*
1268 * The new entry is in the first block, account for it.
1269 */
1270 if (count == blk1->index) {
1271 tmp = totallen + (uint)sizeof(*entry)
1272 + XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
1273 if (XFS_DIR_ABS(half - tmp) > lastdelta)
1274 break;
1275 lastdelta = XFS_DIR_ABS(half - tmp);
1276 totallen = tmp;
1277 foundit = 1;
1278 }
1279
1280 /*
1281 * Wrap around into the second block if necessary.
1282 */
1283 if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
1284 leaf1 = leaf2;
1285 entry = &leaf1->entries[0];
1286 }
1287
1288 /*
1289 * Figure out if next leaf entry would be too much.
1290 */
1291 tmp = totallen + (uint)sizeof(*entry)
1292 + XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
1293 if (XFS_DIR_ABS(half - tmp) > lastdelta)
1294 break;
1295 lastdelta = XFS_DIR_ABS(half - tmp);
1296 totallen = tmp;
1297#undef XFS_DIR_ABS
1298 }
1299
1300 /*
1301 * Calculate the number of namebytes that will end up in lower block.
1302 * If the new entry was counted in, back it out: it is not stored yet.
1303 */
1304 totallen -=
1305 count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
1306 if (foundit) {
1307 totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) +
1308 state->args->namelen;
1309 }
1310
1311 *countarg = count;
1312 *namebytesarg = totallen;
1313 return(foundit);
1314}
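
The scan above is a one-dimensional minimization over a monotone prefix sum:
keep extending the lower block while |half - total| shrinks and stop as soon
as it grows, which is exactly the lastdelta test. The same control flow on a
plain array of entry sizes (balance_point is a hypothetical helper, without
the new-entry and block-wrap bookkeeping):

/* How many leading items go to block 1 for the most even byte split. */
static int balance_point(const int *size, int n)
{
        int i, half = 0, total = 0, lastdelta;

        for (i = 0; i < n; i++)
                half += size[i];
        half /= 2;

        lastdelta = half;       /* delta for the empty prefix */
        for (i = 0; i < n; i++) {
                int tmp = total + size[i];
                int delta = tmp > half ? tmp - half : half - tmp;

                if (delta > lastdelta)
                        break;  /* the split just got worse: stop */
                lastdelta = delta;
                total = tmp;
        }
        return i;               /* items kept in the lower block */
}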
1315
1316/*========================================================================
1317 * Routines used for shrinking the Btree.
1318 *========================================================================*/
1319
1320/*
1321 * Check a leaf block and its neighbors to see if the block should be
1322 * collapsed into one or the other neighbor. Always keep the block
1323 * with the smaller block number.
1324 * If the current block is over 50% full, don't try to join it; return 0.
1325 * If the block is empty, fill in the state structure and return 2.
1326 * If it can be collapsed, fill in the state structure and return 1.
1327 * If nothing can be done, return 0.
1328 */
1329int
1330xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
1331{
1332 xfs_dir_leafblock_t *leaf;
1333 xfs_da_state_blk_t *blk;
1334 xfs_da_blkinfo_t *info;
1335 int count, bytes, forward, error, retval, i;
1336 xfs_dablk_t blkno;
1337 xfs_dabuf_t *bp;
1338
1339 /*
1340 * Check for the degenerate case of the block being over 50% full.
1341 * If so, it's not worth even looking to see if we might be able
1342 * to coalesce with a sibling.
1343 */
1344 blk = &state->path.blk[ state->path.active-1 ];
1345 info = blk->bp->data;
1346 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1347 leaf = (xfs_dir_leafblock_t *)info;
1348 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1349 bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
1350 count * (uint)sizeof(xfs_dir_leaf_entry_t) +
1351 count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
1352 INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1353 if (bytes > (state->blocksize >> 1)) {
1354 *action = 0; /* blk over 50%, don't try to join */
1355 return(0);
1356 }
1357
1358 /*
1359 * Check for the degenerate case of the block being empty.
1360 * If the block is empty, we'll simply delete it, no need to
1361 * coalesce it with a sibling block. We choose (arbitrarily)
1362 * to merge with the forward block unless it is NULL.
1363 */
1364 if (count == 0) {
1365 /*
1366 * Make altpath point to the block we want to keep and
1367 * path point to the block we want to drop (this one).
1368 */
1369 forward = info->forw;
1370 memcpy(&state->altpath, &state->path, sizeof(state->path));
1371 error = xfs_da_path_shift(state, &state->altpath, forward,
1372 0, &retval);
1373 if (error)
1374 return(error);
1375 if (retval) {
1376 *action = 0;
1377 } else {
1378 *action = 2;
1379 }
1380 return(0);
1381 }
1382
1383 /*
1384 * Examine each sibling block to see if we can coalesce with
1385 * at least 25% free space to spare. We need to figure out
1386 * whether to merge with the forward or the backward block.
1387 * We prefer coalescing with the lower numbered sibling so as
1388 * to shrink a directory over time.
1389 */
1390 forward = (INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT)); /* start with smaller blk num */
1391 for (i = 0; i < 2; forward = !forward, i++) {
1392 if (forward)
1393 blkno = INT_GET(info->forw, ARCH_CONVERT);
1394 else
1395 blkno = INT_GET(info->back, ARCH_CONVERT);
1396 if (blkno == 0)
1397 continue;
1398 error = xfs_da_read_buf(state->args->trans, state->args->dp,
1399 blkno, -1, &bp,
1400 XFS_DATA_FORK);
1401 if (error)
1402 return(error);
1403 ASSERT(bp != NULL);
1404
1405 leaf = (xfs_dir_leafblock_t *)info;
1406 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1407 bytes = state->blocksize - (state->blocksize>>2);
1408 bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1409 leaf = bp->data;
1410 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1411 count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
1412 bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1413 bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
1414 bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
1415 bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
1416 if (bytes >= 0)
1417 break; /* fits with at least 25% to spare */
1418
1419 xfs_da_brelse(state->args->trans, bp);
1420 }
1421 if (i >= 2) {
1422 *action = 0;
1423 return(0);
1424 }
1425 xfs_da_buf_done(bp);
1426
1427 /*
1428 * Make altpath point to the block we want to keep (the lower
1429 * numbered block) and path point to the block we want to drop.
1430 */
1431 memcpy(&state->altpath, &state->path, sizeof(state->path));
1432 if (blkno < blk->blkno) {
1433 error = xfs_da_path_shift(state, &state->altpath, forward,
1434 0, &retval);
1435 } else {
1436 error = xfs_da_path_shift(state, &state->path, forward,
1437 0, &retval);
1438 }
1439 if (error)
1440 return(error);
1441 if (retval) {
1442 *action = 0;
1443 } else {
1444 *action = 1;
1445 }
1446 return(0);
1447}
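
/*
 * A minimal caller sketch (illustrative only; in this tree the generic
 * da-btree join code is the real caller), acting on the action code
 * filled in above:
 *
 *	error = xfs_dir_leaf_toosmall(state, &action);
 *	if (error)
 *		return error;
 *	if (action == 2)
 *		... block is empty: just delete it ...
 *	else if (action == 1)
 *		... join with the sibling recorded in state->altpath ...
 */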
1448
1449/*
1450 * Remove a name from the leaf directory structure.
1451 *
1452 * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
1453 * If two leaves are 37% full, when combined they will leave 25% free.
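 * (Arithmetic check: two leaves each 37% full combine to at most 74%
 * usage, leaving at least 25% free, which is the "25% to spare" margin
 * that xfs_dir_leaf_toosmall() requires before joining.)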
1454 */
1455int
1456xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
1457{
1458 xfs_dir_leafblock_t *leaf;
1459 xfs_dir_leaf_hdr_t *hdr;
1460 xfs_dir_leaf_map_t *map;
1461 xfs_dir_leaf_entry_t *entry;
1462 xfs_dir_leaf_name_t *namest;
1463 int before, after, smallest, entsize;
1464 int tablesize, tmp, i;
1465 xfs_mount_t *mp;
1466
1467 leaf = bp->data;
1468 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1469 hdr = &leaf->hdr;
1470 mp = trans->t_mountp;
1471 ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
1472 ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
1473 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
1474 entry = &leaf->entries[index];
1475 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
1476 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
1477
1478 /*
1479 * Scan through free region table:
1480 * check for adjacency of free'd entry with an existing one,
1481 * find smallest free region in case we need to replace it,
1482 * adjust any map that borders the entry table,
1483 */
1484 tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
1485 + (uint)sizeof(xfs_dir_leaf_hdr_t);
1486 map = &hdr->freemap[0];
1487 tmp = INT_GET(map->size, ARCH_CONVERT);
1488 before = after = -1;
1489 smallest = XFS_DIR_LEAF_MAPSIZE - 1;
1490 entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
1491 for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
1492 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
1493 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
1494 if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
1495 INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
1496 INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
1497 }
1498
1499 if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
1500 before = i;
1501 } else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
1502 after = i;
1503 } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
1504 tmp = INT_GET(map->size, ARCH_CONVERT);
1505 smallest = i;
1506 }
1507 }
1508
1509 /*
1510 * Coalesce adjacent freemap regions,
1511 * or replace the smallest region.
1512 */
1513 if ((before >= 0) || (after >= 0)) {
1514 if ((before >= 0) && (after >= 0)) {
1515 map = &hdr->freemap[before];
1516 INT_MOD(map->size, ARCH_CONVERT, entsize);
1517 INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
1518 hdr->freemap[after].base = 0;
1519 hdr->freemap[after].size = 0;
1520 } else if (before >= 0) {
1521 map = &hdr->freemap[before];
1522 INT_MOD(map->size, ARCH_CONVERT, entsize);
1523 } else {
1524 map = &hdr->freemap[after];
1525 INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
1526 INT_MOD(map->size, ARCH_CONVERT, entsize);
1527 }
1528 } else {
1529 /*
1530 * Replace smallest region (if it is smaller than free'd entry)
1531 */
1532 map = &hdr->freemap[smallest];
1533 if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
1534 INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
1535 INT_SET(map->size, ARCH_CONVERT, entsize);
1536 }
1537 }
1538
1539 /*
1540 * Did we remove the first entry?
1541 */
1542 if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
1543 smallest = 1;
1544 else
1545 smallest = 0;
1546
1547 /*
1548 * Compress the remaining entries and zero out the removed stuff.
1549 */
1550 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
1551 memset((char *)namest, 0, entsize);
1552 xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
1553
1554 INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
1555 tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
1556 memmove(entry, entry + 1, tmp);
1557 INT_MOD(hdr->count, ARCH_CONVERT, -1);
1558 xfs_da_log_buf(trans, bp,
1559 XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
1560 entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
1561 memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t));
1562
1563 /*
1564 * If we removed the first entry, re-find the first used byte
1565 * in the name area. Note that if the entry was the "firstused",
1566 * then we don't have a "hole" in our block resulting from
1567 * removing the name.
1568 */
1569 if (smallest) {
1570 tmp = XFS_LBSIZE(mp);
1571 entry = &leaf->entries[0];
1572 for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
1573 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
1574 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
1575 if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
1576 tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
1577 }
1578 INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
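		/*
		 * The raw test below is safe: zero is zero in either byte
		 * order. firstused == 0 is reserved, so when an empty 64KB
		 * block wraps the 16-bit field to zero, back off by one.
		 */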
1579 if (!hdr->firstused)
1580 INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
1581 } else {
1582 hdr->holes = 1; /* mark as needing compaction */
1583 }
1584
1585 xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1586
1587 /*
1588	 * Check if leaf is less than 37% full (mp->m_dir_magicpct); if so,
1589	 * the caller may want to "join" the leaf with a sibling.
1590 */
1591 tmp = (uint)sizeof(xfs_dir_leaf_hdr_t);
1592 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
1593 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
1594 tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1595 if (tmp < mp->m_dir_magicpct)
1596 return(1); /* leaf is < 37% full */
1597 return(0);
1598}
1599
1600/*
1601 * Move all the directory entries from drop_leaf into save_leaf.
1602 */
1603void
1604xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1605 xfs_da_state_blk_t *save_blk)
1606{
1607 xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
1608 xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
1609 xfs_mount_t *mp;
1610 char *tmpbuffer;
1611
1612 /*
1613 * Set up environment.
1614 */
1615 mp = state->mp;
1616 ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
1617 ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
1618 drop_leaf = drop_blk->bp->data;
1619 save_leaf = save_blk->bp->data;
1620 ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1621 ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1622 drop_hdr = &drop_leaf->hdr;
1623 save_hdr = &save_leaf->hdr;
1624
1625 /*
1626 * Save last hashval from dying block for later Btree fixup.
1627 */
1628	drop_blk->hashval = INT_GET(drop_leaf->entries[ INT_GET(drop_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1629
1630 /*
1631 * Check if we need a temp buffer, or can we do it in place.
1632 * Note that we don't check "leaf" for holes because we will
1633	 * always be dropping it; toosmall() decided that for us already.
1634 */
1635 if (save_hdr->holes == 0) {
1636 /*
1637 * dest leaf has no holes, so we add there. May need
1638 * to make some room in the entry array.
1639 */
1640 if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
1641 xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
1642 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1643 } else {
1644 xfs_dir_leaf_moveents(drop_leaf, 0,
1645 save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
1646 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1647 }
1648 } else {
1649 /*
1650 * Destination has holes, so we make a temporary copy
1651 * of the leaf and add them both to that.
1652 */
1653 tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
1654 ASSERT(tmpbuffer != NULL);
1655 memset(tmpbuffer, 0, state->blocksize);
1656 tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
1657 tmp_hdr = &tmp_leaf->hdr;
1658 tmp_hdr->info = save_hdr->info; /* struct copy */
1659 tmp_hdr->count = 0;
1660 INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
1661 if (!tmp_hdr->firstused)
1662 INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
1663 tmp_hdr->namebytes = 0;
1664 if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
1665 xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
1666 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1667 xfs_dir_leaf_moveents(save_leaf, 0,
1668 tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1669 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
1670 } else {
1671 xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
1672 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
1673 xfs_dir_leaf_moveents(drop_leaf, 0,
1674 tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1675 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1676 }
1677 memcpy(save_leaf, tmp_leaf, state->blocksize);
1678 kmem_free(tmpbuffer, state->blocksize);
1679 }
1680
1681 xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
1682 state->blocksize - 1);
1683
1684 /*
1685 * Copy out last hashval in each block for B-tree code.
1686 */
1687 save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1688}
1689
1690/*========================================================================
1691 * Routines used for finding things in the Btree.
1692 *========================================================================*/
1693
1694/*
1695 * Look up a name in a leaf directory structure.
1696 * This is the internal routine, it uses the caller's buffer.
1697 *
1698 * Note that duplicate keys are allowed, but only check within the
1699 * current leaf node. The Btree code must check in adjacent leaf nodes.
1700 *
1701 * Return in *index the index into the entry[] array of either the found
1702 * entry, or where the entry should have been (insert before that entry).
1703 *
1704 * Don't change the args->inumber unless we find the filename.
1705 */
1706int
1707xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
1708{
1709 xfs_dir_leafblock_t *leaf;
1710 xfs_dir_leaf_entry_t *entry;
1711 xfs_dir_leaf_name_t *namest;
1712 int probe, span;
1713 xfs_dahash_t hashval;
1714
1715 leaf = bp->data;
1716 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1717 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8));
1718
1719 /*
1720 * Binary search. (note: small blocks will skip this loop)
1721 */
1722 hashval = args->hashval;
1723 probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
1724 for (entry = &leaf->entries[probe]; span > 4;
1725 entry = &leaf->entries[probe]) {
1726 span /= 2;
1727 if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
1728 probe += span;
1729 else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
1730 probe -= span;
1731 else
1732 break;
1733 }
1734 ASSERT((probe >= 0) && \
1735 ((!leaf->hdr.count) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
1736 ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
1737
1738 /*
1739 * Since we may have duplicate hashval's, find the first matching
1740 * hashval in the leaf.
1741 */
1742 while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) {
1743 entry--;
1744 probe--;
1745 }
1746 while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
1747 entry++;
1748 probe++;
1749 }
1750 if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
1751 *index = probe;
1752 ASSERT(args->oknoent);
1753 return(XFS_ERROR(ENOENT));
1754 }
1755
1756 /*
1757 * Duplicate keys may be present, so search all of them for a match.
1758 */
1759 while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
1760 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
1761 if (entry->namelen == args->namelen &&
1762 namest->name[0] == args->name[0] &&
1763 memcmp(args->name, namest->name, args->namelen) == 0) {
1764 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber);
1765 *index = probe;
1766 return(XFS_ERROR(EEXIST));
1767 }
1768 entry++;
1769 probe++;
1770 }
1771 *index = probe;
1772 ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
1773 return(XFS_ERROR(ENOENT));
1774}
1775
1776/*========================================================================
1777 * Utility routines.
1778 *========================================================================*/
1779
1780/*
1781 * Move the indicated entries from one leaf to another.
1782 * NOTE: this routine modifies both source and destination leaves.
1783 */
1784/* ARGSUSED */
1785STATIC void
1786xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
1787 xfs_dir_leafblock_t *leaf_d, int start_d,
1788 int count, xfs_mount_t *mp)
1789{
1790 xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
1791 xfs_dir_leaf_entry_t *entry_s, *entry_d;
1792 int tmp, i;
1793
1794 /*
1795 * Check for nothing to do.
1796 */
1797 if (count == 0)
1798 return;
1799
1800 /*
1801 * Set up environment.
1802 */
1803 ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1804 ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1805 hdr_s = &leaf_s->hdr;
1806 hdr_d = &leaf_d->hdr;
1807 ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
1808 ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
1809 ((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
1810 ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
1811 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
1812 ((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
1813
1814 ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
1815 ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
1816 ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
1817
1818 /*
1819 * Move the entries in the destination leaf up to make a hole?
1820 */
1821 if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
1822 tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
1823 tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
1824 entry_s = &leaf_d->entries[start_d];
1825 entry_d = &leaf_d->entries[start_d + count];
1826		memmove(entry_d, entry_s, tmp);	/* regions may overlap */
1827 }
1828
1829 /*
1830	 * Copy all entries in the same (sorted) order,
1831 * but allocate filenames packed and in sequence.
1832 */
1833 entry_s = &leaf_s->entries[start_s];
1834 entry_d = &leaf_d->entries[start_d];
1835 for (i = 0; i < count; entry_s++, entry_d++, i++) {
1836 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
1837 tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
1838 INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
1839 entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
1840 INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
1841 entry_d->namelen = entry_s->namelen;
1842 ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
1843 memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)),
1844 XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), tmp);
1845 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
1846 memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
1847 0, tmp);
1848 INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
1849 INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
1850 INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
1851 INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
1852 tmp = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
1853 + (uint)sizeof(xfs_dir_leaf_hdr_t);
1854 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
1855
1856 }
1857
1858 /*
1859 * Zero out the entries we just copied.
1860 */
1861 if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
1862 tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
1863 entry_s = &leaf_s->entries[start_s];
1864 ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
1865 memset((char *)entry_s, 0, tmp);
1866 } else {
1867 /*
1868 * Move the remaining entries down to fill the hole,
1869 * then zero the entries at the top.
1870 */
1871 tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
1872 tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
1873 entry_s = &leaf_s->entries[start_s + count];
1874 entry_d = &leaf_s->entries[start_s];
1875		memmove(entry_d, entry_s, tmp);	/* regions may overlap */
1876
1877 tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
1878 entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
1879 ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
1880 memset((char *)entry_s, 0, tmp);
1881 }
1882
1883 /*
1884 * Fill in the freemap information
1885 */
1886 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
1887 INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
1888 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
1889 INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, (hdr_d->freemap[2].base = 0));
1890 INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, (hdr_d->freemap[2].size = 0));
1891 hdr_s->holes = 1; /* leaf may not be compact */
1892}
1893
1894/*
1895 * Compare two leaf blocks' "order": return 1 if leaf2 should come before leaf1.
1896 */
1897int
1898xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
1899{
1900 xfs_dir_leafblock_t *leaf1, *leaf2;
1901
1902 leaf1 = leaf1_bp->data;
1903 leaf2 = leaf2_bp->data;
1904 ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) &&
1905 (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC));
1906 if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
1907 ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
1908 INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
1909 (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
1910 INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
1911 return(1);
1912 }
1913 return(0);
1914}
1915
1916/*
1917 * Pick up the last hashvalue from a leaf block.
1918 */
1919xfs_dahash_t
1920xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
1921{
1922 xfs_dir_leafblock_t *leaf;
1923
1924 leaf = bp->data;
1925 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1926 if (count)
1927 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1928 if (!leaf->hdr.count)
1929 return(0);
1930 return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
1931}
1932
1933/*
1934 * Copy out directory entries for getdents(), for leaf directories.
1935 */
1936int
1937xfs_dir_leaf_getdents_int(
1938 xfs_dabuf_t *bp,
1939 xfs_inode_t *dp,
1940 xfs_dablk_t bno,
1941 uio_t *uio,
1942 int *eobp,
1943 xfs_dirent_t *dbp,
1944 xfs_dir_put_t put,
1945 xfs_daddr_t nextda)
1946{
1947 xfs_dir_leafblock_t *leaf;
1948 xfs_dir_leaf_entry_t *entry;
1949 xfs_dir_leaf_name_t *namest;
1950 int entno, want_entno, i, nextentno;
1951 xfs_mount_t *mp;
1952 xfs_dahash_t cookhash;
1953 xfs_dahash_t nexthash = 0;
1954#if (BITS_PER_LONG == 32)
1955 xfs_dahash_t lasthash = XFS_DA_MAXHASH;
1956#endif
1957 xfs_dir_put_args_t p;
1958
1959 mp = dp->i_mount;
1960 leaf = bp->data;
1961 if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
1962 *eobp = 1;
1963 return(XFS_ERROR(ENOENT)); /* XXX wrong code */
1964 }
1965
1966 want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
1967
1968 cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
1969
1970 xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
1971
1972 /*
1973 * Re-find our place.
1974 */
1975 for (i = entno = 0, entry = &leaf->entries[0];
1976 i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
1977 entry++, i++) {
1978
1979 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
1980 INT_GET(entry->nameidx, ARCH_CONVERT));
1981
1982 if (unlikely(
1983 ((char *)namest < (char *)leaf) ||
1984 ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
1985 XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(1)",
1986 XFS_ERRLEVEL_LOW, mp, leaf);
1987 xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
1988 return XFS_ERROR(EFSCORRUPTED);
1989 }
1990 if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
1991 if ( entno < want_entno
1992 && INT_GET(entry->hashval, ARCH_CONVERT)
1993 == cookhash) {
1994 /*
1995 * Trying to get to a particular offset in a
1996 * run of equal-hashval entries.
1997 */
1998 entno++;
1999 } else if ( want_entno > 0
2000 && entno == want_entno
2001 && INT_GET(entry->hashval, ARCH_CONVERT)
2002 == cookhash) {
2003 break;
2004 } else {
2005 entno = 0;
2006 break;
2007 }
2008 }
2009 }
2010
2011 if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
2012 xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
2013 if (!INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))
2014 uio->uio_offset =
2015 XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
2016 /*
2017 * Don't set uio_offset if there's another block:
2018 * the node code will be setting uio_offset anyway.
2019 */
2020 *eobp = 0;
2021 return(0);
2022 }
2023 xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
2024
2025 p.dbp = dbp;
2026 p.put = put;
2027 p.uio = uio;
2028
2029 /*
2030 * We're synchronized, start copying entries out to the user.
2031 */
2032 for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
2033 entry++, i++, (entno = nextentno)) {
2034 int lastresid=0, retval;
2035 xfs_dircook_t lastoffset;
2036 xfs_dahash_t thishash;
2037
2038 /*
2039 * Check for a damaged directory leaf block and pick up
2040 * the inode number from this entry.
2041 */
2042 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
2043 INT_GET(entry->nameidx, ARCH_CONVERT));
2044
2045 if (unlikely(
2046 ((char *)namest < (char *)leaf) ||
2047 ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
2048 XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(2)",
2049 XFS_ERRLEVEL_LOW, mp, leaf);
2050 xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
2051 return XFS_ERROR(EFSCORRUPTED);
2052 }
2053
2054 xfs_dir_trace_g_duc("leaf: middle cookie ",
2055 dp, uio, p.cook.o);
2056
2057 if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
2058 nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
2059
2060 if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
2061 nextentno = entno + 1;
2062 else
2063 nextentno = 0;
2064 XFS_PUT_COOKIE(p.cook, mp, bno, nextentno, nexthash);
2065 xfs_dir_trace_g_duc("leaf: middle cookie ",
2066 dp, uio, p.cook.o);
2067
2068 } else if ((thishash = INT_GET(leaf->hdr.info.forw,
2069 ARCH_CONVERT))) {
2070 xfs_dabuf_t *bp2;
2071 xfs_dir_leafblock_t *leaf2;
2072
2073 ASSERT(nextda != -1);
2074
2075 retval = xfs_da_read_buf(dp->i_transp, dp, thishash,
2076 nextda, &bp2, XFS_DATA_FORK);
2077 if (retval)
2078 return(retval);
2079
2080 ASSERT(bp2 != NULL);
2081
2082 leaf2 = bp2->data;
2083
2084 if (unlikely(
2085 (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
2086 != XFS_DIR_LEAF_MAGIC)
2087 || (INT_GET(leaf2->hdr.info.back, ARCH_CONVERT)
2088 != bno))) { /* GROT */
2089 XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(3)",
2090 XFS_ERRLEVEL_LOW, mp,
2091 leaf2);
2092 xfs_da_brelse(dp->i_transp, bp2);
2093
2094 return(XFS_ERROR(EFSCORRUPTED));
2095 }
2096
2097 nexthash = INT_GET(leaf2->entries[0].hashval,
2098 ARCH_CONVERT);
2099 nextentno = -1;
2100 XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash);
2101 xfs_da_brelse(dp->i_transp, bp2);
2102 xfs_dir_trace_g_duc("leaf: next blk cookie",
2103 dp, uio, p.cook.o);
2104 } else {
2105 nextentno = -1;
2106 XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
2107 }
2108
2109 /*
2110 * Save off the cookie so we can fall back should the
2111		 * 'put' into the outgoing buffer fail. To handle a run
2112 * of equal-hashvals, the off_t structure on 64bit
2113 * builds has entno built into the cookie to ID the
2114 * entry. On 32bit builds, we only have space for the
2115 * hashval so we can't ID specific entries within a group
2116 * of same hashval entries. For this, lastoffset is set
2117 * to the first in the run of equal hashvals so we don't
2118 * include any entries unless we can include all entries
2119 * that share the same hashval. Hopefully the buffer
2120 * provided is big enough to handle it (see pv763517).
2121 */
2122#if (BITS_PER_LONG == 32)
2123 if ((thishash = INT_GET(entry->hashval, ARCH_CONVERT))
2124 != lasthash) {
2125 XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
2126 lastresid = uio->uio_resid;
2127 lasthash = thishash;
2128 } else {
2129 xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
2130 dp, uio, p.cook.o);
2131 }
2132#else
2133 thishash = INT_GET(entry->hashval, ARCH_CONVERT);
2134 XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
2135 lastresid = uio->uio_resid;
2136#endif /* BITS_PER_LONG == 32 */
2137
2138 /*
2139 * Put the current entry into the outgoing buffer. If we fail
2140 * then restore the UIO to the first entry in the current
2141		 * run of equal-hashval entries (probably only 1 entry long).
2142 */
2143 p.ino = XFS_GET_DIR_INO8(namest->inumber);
2144#if XFS_BIG_INUMS
2145 p.ino += mp->m_inoadd;
2146#endif
2147 p.name = (char *)namest->name;
2148 p.namelen = entry->namelen;
2149
2150 retval = p.put(&p);
2151
2152 if (!p.done) {
2153 uio->uio_offset = lastoffset.o;
2154 uio->uio_resid = lastresid;
2155
2156 *eobp = 1;
2157
2158 xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
2159
2160 return(retval);
2161 }
2162 }
2163
2164 uio->uio_offset = p.cook.o;
2165
2166 *eobp = 0;
2167
2168 xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
2169
2170 return(0);
2171}
2172
2173/*
2174 * Format a dirent64 structure and copy it out to the user's buffer.
2175 */
2176int
2177xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
2178{
2179 iovec_t *iovp;
2180 int reclen, namelen;
2181 xfs_dirent_t *idbp;
2182 uio_t *uio;
2183
2184 namelen = pa->namelen;
2185 reclen = DIRENTSIZE(namelen);
2186 uio = pa->uio;
2187 if (reclen > uio->uio_resid) {
2188 pa->done = 0;
2189 return 0;
2190 }
2191 iovp = uio->uio_iov;
2192 idbp = (xfs_dirent_t *)iovp->iov_base;
2193 iovp->iov_base = (char *)idbp + reclen;
2194 iovp->iov_len -= reclen;
2195 uio->uio_resid -= reclen;
2196 idbp->d_reclen = reclen;
2197 idbp->d_ino = pa->ino;
2198 idbp->d_off = pa->cook.o;
2199 idbp->d_name[namelen] = '\0';
2200 pa->done = 1;
2201 memcpy(idbp->d_name, pa->name, namelen);
2202 return 0;
2203}
2204
2205/*
2206 * Format a dirent64 structure and copy it out to the user's buffer.
2207 */
2208int
2209xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
2210{
2211 int retval, reclen, namelen;
2212 xfs_dirent_t *idbp;
2213 uio_t *uio;
2214
2215 namelen = pa->namelen;
2216 reclen = DIRENTSIZE(namelen);
2217 uio = pa->uio;
2218 if (reclen > uio->uio_resid) {
2219 pa->done = 0;
2220 return 0;
2221 }
2222 idbp = pa->dbp;
2223 idbp->d_reclen = reclen;
2224 idbp->d_ino = pa->ino;
2225 idbp->d_off = pa->cook.o;
2226 idbp->d_name[namelen] = '\0';
2227 memcpy(idbp->d_name, pa->name, namelen);
2228 retval = uio_read((caddr_t)idbp, reclen, uio);
2229 pa->done = (retval == 0);
2230 return retval;
2231}
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
new file mode 100644
index 000000000000..00d68d33cc7a
--- /dev/null
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -0,0 +1,248 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR_LEAF_H__
33#define __XFS_DIR_LEAF_H__
34
35/*
36 * Directory layout, internal structure, access macros, etc.
37 *
38 * Large directories are structured around Btrees where all the data
39 * elements are in the leaf nodes. Filenames are hashed into an int,
40 * then that int is used as the index into the Btree. Since the hashval
41 * of a filename may not be unique, we may have duplicate keys. The
42 * internal links in the Btree are logical block offsets into the file.
43 */
44
45struct uio;
46struct xfs_bmap_free;
47struct xfs_dabuf;
48struct xfs_da_args;
49struct xfs_da_state;
50struct xfs_da_state_blk;
51struct xfs_dir_put_args;
52struct xfs_inode;
53struct xfs_mount;
54struct xfs_trans;
55
56/*========================================================================
57 * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
58 *========================================================================*/
59
60/*
61 * This is the structure of the leaf nodes in the Btree.
62 *
63 * Struct leaf_entry's are packed from the top. Names grow from the bottom
64 * but are not packed. The freemap contains run-length-encoded entries
65 * for the free bytes after the leaf_entry's, but only the N largest such;
66 * smaller runs are dropped. When the freemap doesn't show enough space
67 * for an allocation, we compact the namelist area and try again. If we
68 * still don't have enough space, then we have to split the block.
69 *
70 * Since we have duplicate hash keys, for each key that matches, compare
71 * the actual string. The root and intermediate node search always takes
72 * the first-in-the-block key match found, so we should only have to work
73 * "forw"ard. If none matches, continue with the "forw"ard leaf nodes
74 * until the hash key changes or the filename is found.
75 *
76 * The parent directory and the self-pointer are explicitly represented
77 * (ie: there are entries for "." and "..").
78 *
79 * Note that the count being a __uint16_t limits us to something like a
80 * blocksize of 1.3MB in the face of worst case (short) filenames.
81 */
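
/*
 * Rough picture of the layout described above (illustrative only, not
 * to scale); hdr.firstused marks the lowest byte of the name area:
 *
 *	+-----+---------------------+ ...free... +--------------------+
 *	| hdr | entries[] (grow ->) |            | namelist (<- grow) |
 *	+-----+---------------------+ ...........+--------------------+
 *	0                                        ^firstused  blocksize^
 */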
82#define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */
83
84typedef struct xfs_dir_leafblock {
85 struct xfs_dir_leaf_hdr { /* constant-structure header block */
86 xfs_da_blkinfo_t info; /* block type, links, etc. */
87 __uint16_t count; /* count of active leaf_entry's */
88 __uint16_t namebytes; /* num bytes of name strings stored */
89 __uint16_t firstused; /* first used byte in name area */
90 __uint8_t holes; /* != 0 if blk needs compaction */
91 __uint8_t pad1;
92 struct xfs_dir_leaf_map {/* RLE map of free bytes */
93 __uint16_t base; /* base of free region */
94 __uint16_t size; /* run length of free region */
95 } freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */
96 } hdr;
97 struct xfs_dir_leaf_entry { /* sorted on key, not name */
98 xfs_dahash_t hashval; /* hash value of name */
99 __uint16_t nameidx; /* index into buffer of name */
100 __uint8_t namelen; /* length of name string */
101 __uint8_t pad2;
102 } entries[1]; /* var sized array */
103 struct xfs_dir_leaf_name {
104 xfs_dir_ino_t inumber; /* inode number for this key */
105 __uint8_t name[1]; /* name string itself */
106 } namelist[1]; /* grows from bottom of buf */
107} xfs_dir_leafblock_t;
108typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t;
109typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t;
110typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t;
111typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t;
112
113/*
114 * Length of name for which a 512-byte block filesystem
115 * can get a double split.
116 */
117#define XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN \
118 (512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
119 (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
120 (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
121
122typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
123
124typedef union {
125 xfs_off_t o; /* offset (cookie) */
126 /*
127 * Watch the order here (endian-ness dependent).
128 */
129 struct {
130#if __BYTE_ORDER == __LITTLE_ENDIAN
131 xfs_dahash_t h; /* hash value */
132 __uint32_t be; /* block and entry */
133#else /* __BYTE_ORDER == __BIG_ENDIAN */
134 __uint32_t be; /* block and entry */
135 xfs_dahash_t h; /* hash value */
136#endif /* __BYTE_ORDER == __BIG_ENDIAN */
137 } s;
138} xfs_dircook_t;
139
140#define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \
141 ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
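
/*
 * Illustrative use (the locals here are hypothetical, not from this
 * file): build a cookie for entry 2 of block 7 with hash 0x1234, then
 * hand the packed 64-bit form back through the uio offset:
 *
 *	xfs_dircook_t cook;
 *
 *	XFS_PUT_COOKIE(cook, mp, 7, 2, 0x1234);
 *	uio->uio_offset = cook.o;
 */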
142
143typedef struct xfs_dir_put_args
144{
145 xfs_dircook_t cook; /* cookie of (next) entry */
146 xfs_intino_t ino; /* inode number */
147 struct xfs_dirent *dbp; /* buffer pointer */
148 char *name; /* directory entry name */
149 int namelen; /* length of name */
150 int done; /* output: set if value was stored */
151 xfs_dir_put_t put; /* put function ptr (i/o) */
152 struct uio *uio; /* uio control structure */
153} xfs_dir_put_args_t;
154
155#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
156int xfs_dir_leaf_entsize_byname(int len);
157#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) xfs_dir_leaf_entsize_byname(len)
158#else
159#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) /* space a name will use */ \
160 ((uint)sizeof(xfs_dir_leaf_name_t)-1 + len)
161#endif
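
/*
 * Worked example (assuming an 8-byte xfs_ino_t, which makes
 * sizeof(xfs_dir_leaf_name_t) == 9): a 5-byte name costs
 * XFS_DIR_LEAF_ENTSIZE_BYNAME(5) == (9 - 1) + 5 == 13 bytes of name
 * area, on top of its fixed xfs_dir_leaf_entry_t slot.
 */
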
162#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
163int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry);
164#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry) \
165 xfs_dir_leaf_entsize_byentry(entry)
166#else
167#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry) /* space an entry will use */ \
168 ((uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen)
169#endif
170#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
171xfs_dir_leaf_name_t *
172xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset);
173#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset) \
174 xfs_dir_leaf_namestruct(leafp,offset)
175#else
176#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset) /* point to name struct */ \
177 ((xfs_dir_leaf_name_t *)&((char *)(leafp))[offset])
178#endif
179
180/*========================================================================
181 * Function prototypes for the kernel.
182 *========================================================================*/
183
184/*
185 * Internal routines when dirsize < XFS_LITINO(mp).
186 */
187int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
188int xfs_dir_shortform_addname(struct xfs_da_args *args);
189int xfs_dir_shortform_lookup(struct xfs_da_args *args);
190int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
191int xfs_dir_shortform_removename(struct xfs_da_args *args);
192int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
193 struct xfs_dirent *dbp, xfs_dir_put_t put);
194int xfs_dir_shortform_replace(struct xfs_da_args *args);
195
196/*
197 * Internal routines when dirsize == XFS_LBSIZE(mp).
198 */
199int xfs_dir_leaf_to_node(struct xfs_da_args *args);
200int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
201
202/*
203 * Routines used for growing the Btree.
204 */
205int xfs_dir_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
206 struct xfs_dabuf **bpp);
207int xfs_dir_leaf_split(struct xfs_da_state *state,
208 struct xfs_da_state_blk *oldblk,
209 struct xfs_da_state_blk *newblk);
210int xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
211 struct xfs_da_args *args, int insertion_index);
212int xfs_dir_leaf_addname(struct xfs_da_args *args);
213int xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
214 struct xfs_da_args *args,
215 int *index_found_at);
216int xfs_dir_leaf_remove(struct xfs_trans *trans,
217 struct xfs_dabuf *leaf_buffer,
218 int index_to_remove);
219int xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
220 xfs_dablk_t bno, struct uio *uio,
221 int *eobp, struct xfs_dirent *dbp,
222 xfs_dir_put_t put, xfs_daddr_t nextda);
223
224/*
225 * Routines used for shrinking the Btree.
226 */
227int xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
228void xfs_dir_leaf_unbalance(struct xfs_da_state *state,
229 struct xfs_da_state_blk *drop_blk,
230 struct xfs_da_state_blk *save_blk);
231
232/*
233 * Utility routines.
234 */
235xfs_dahash_t xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
236int xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
237 struct xfs_dabuf *leaf2_bp);
238int xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
239int xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
240int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
241
242
243/*
244 * Global data.
245 */
246extern xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
247
248#endif /* __XFS_DIR_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
new file mode 100644
index 000000000000..a61bcfc2a87d
--- /dev/null
+++ b/fs/xfs/xfs_dir_sf.h
@@ -0,0 +1,172 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR_SF_H__
33#define __XFS_DIR_SF_H__
34
35/*
36 * Directory layout when stored internal to an inode.
37 *
38 * Small directories are packed as tightly as possible so as to
39 * fit into the literal area of the inode.
40 */
41
42typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
43
44/*
45 * The parent directory has a dedicated field, and the self-pointer must
46 * be calculated on the fly.
47 *
48 * Entries are packed toward the top as tightly as possible. The header
49 * and the elements must be memcpy'd out into a work area to get correct
50 * alignment for the inode number fields.
51 */
52typedef struct xfs_dir_shortform {
53 struct xfs_dir_sf_hdr { /* constant-structure header block */
54 xfs_dir_ino_t parent; /* parent dir inode number */
55 __uint8_t count; /* count of active entries */
56 } hdr;
57 struct xfs_dir_sf_entry {
58 xfs_dir_ino_t inumber; /* referenced inode number */
59 __uint8_t namelen; /* actual length of name (no NULL) */
60 __uint8_t name[1]; /* name */
61 } list[1]; /* variable sized array */
62} xfs_dir_shortform_t;
63typedef struct xfs_dir_sf_hdr xfs_dir_sf_hdr_t;
64typedef struct xfs_dir_sf_entry xfs_dir_sf_entry_t;
65
66/*
67 * We generate this, then sort it, so that readdirs are returned in
68 * hash-order. Otherwise seekdir won't work.
69 */
70typedef struct xfs_dir_sf_sort {
71 __uint8_t entno; /* .=0, ..=1, else entry# + 2 */
72 __uint8_t seqno; /* sequence # with same hash value */
73 __uint8_t namelen; /* length of name value (no null) */
74 xfs_dahash_t hash; /* this entry's hash value */
75 xfs_intino_t ino; /* this entry's inode number */
76 char *name; /* name value, pointer into buffer */
77} xfs_dir_sf_sort_t;
78
79#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_GET_DIRINO)
80void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to);
81#define XFS_DIR_SF_GET_DIRINO(from,to) xfs_dir_sf_get_dirino(from, to)
82#else
83#define XFS_DIR_SF_GET_DIRINO(from,to) (*(to) = XFS_GET_DIR_INO8(*from))
84#endif
85#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_PUT_DIRINO)
86void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to);
87#define XFS_DIR_SF_PUT_DIRINO(from,to) xfs_dir_sf_put_dirino(from, to)
88#else
89#define XFS_DIR_SF_PUT_DIRINO(from,to) XFS_PUT_DIR_INO8(*(from), *(to))
90#endif
91#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
92int xfs_dir_sf_entsize_byname(int len);
93#define XFS_DIR_SF_ENTSIZE_BYNAME(len) xfs_dir_sf_entsize_byname(len)
94#else
95#define XFS_DIR_SF_ENTSIZE_BYNAME(len) /* space a name uses */ \
96 ((uint)sizeof(xfs_dir_sf_entry_t)-1 + (len))
97#endif
98#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
99int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep);
100#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep) xfs_dir_sf_entsize_byentry(sfep)
101#else
102#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep) /* space an entry uses */ \
103 ((uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen)
104#endif
105#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_NEXTENTRY)
106xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep);
107#define XFS_DIR_SF_NEXTENTRY(sfep) xfs_dir_sf_nextentry(sfep)
108#else
109#define XFS_DIR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
110 ((xfs_dir_sf_entry_t *) \
111 ((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)))
112#endif
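
/*
 * Typical walk over the packed entry list (a sketch; 'sfp' is a
 * hypothetical pointer to a valid shortform directory):
 *
 *	xfs_dir_sf_entry_t *sfep;
 *	int i;
 *
 *	sfep = &sfp->list[0];
 *	for (i = 0; i < sfp->hdr.count; i++)
 *		sfep = XFS_DIR_SF_NEXTENTRY(sfep);
 */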
113#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ALLFIT)
114int xfs_dir_sf_allfit(int count, int totallen);
115#define XFS_DIR_SF_ALLFIT(count,totallen) \
116 xfs_dir_sf_allfit(count,totallen)
117#else
118#define XFS_DIR_SF_ALLFIT(count,totallen) /* will all entries fit? */ \
119 ((uint)sizeof(xfs_dir_sf_hdr_t) + \
120 ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen))
121#endif
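
/*
 * Worked example (assuming an 8-byte xfs_ino_t): sizeof(hdr) == 9 and
 * sizeof(entry) - 1 == 9, so three entries whose names total 14 bytes
 * need XFS_DIR_SF_ALLFIT(3, 14) == 9 + 9*3 + 14 == 50 bytes of the
 * inode's literal area.
 */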
122
123#if defined(XFS_DIR_TRACE)
124
125/*
126 * Kernel tracing support for directories.
127 */
128struct uio;
129struct xfs_inode;
130struct xfs_da_intnode;
131struct xfs_dinode;
132struct xfs_dir_leafblock;
133struct xfs_dir_leaf_entry;
134
135#define XFS_DIR_TRACE_SIZE 4096 /* size of global trace buffer */
136extern ktrace_t *xfs_dir_trace_buf;
137
138/*
139 * Trace record types.
140 */
141#define XFS_DIR_KTRACE_G_DU 1 /* dp, uio */
142#define XFS_DIR_KTRACE_G_DUB 2 /* dp, uio, bno */
143#define XFS_DIR_KTRACE_G_DUN 3 /* dp, uio, node */
144#define XFS_DIR_KTRACE_G_DUL 4 /* dp, uio, leaf */
145#define XFS_DIR_KTRACE_G_DUE 5 /* dp, uio, leaf entry */
146#define XFS_DIR_KTRACE_G_DUC 6 /* dp, uio, cookie */
147
148void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
149void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
150 xfs_dablk_t bno);
151void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
152 struct xfs_da_intnode *node);
153void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
154 struct xfs_dir_leafblock *leaf);
155void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
156 struct xfs_dir_leaf_entry *entry);
157void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
158 xfs_off_t cookie);
159void xfs_dir_trace_enter(int type, char *where,
160 void *a0, void *a1, void *a2, void *a3,
161 void *a4, void *a5, void *a6, void *a7,
162 void *a8, void *a9, void *a10, void *a11);
163#else
164#define xfs_dir_trace_g_du(w,d,u)
165#define xfs_dir_trace_g_dub(w,d,u,b)
166#define xfs_dir_trace_g_dun(w,d,u,n)
167#define xfs_dir_trace_g_dul(w,d,u,l)
168#define xfs_dir_trace_g_due(w,d,u,e)
169#define xfs_dir_trace_g_duc(w,d,u,c)
170#endif /* XFS_DIR_TRACE */
171
172#endif /* __XFS_DIR_SF_H__ */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
new file mode 100644
index 000000000000..55ae3e67d245
--- /dev/null
+++ b/fs/xfs/xfs_dmapi.h
@@ -0,0 +1,212 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DMAPI_H__
33#define __XFS_DMAPI_H__
34
35/* Values used to define the on-disk version of dm_attrname_t. All
36 * on-disk attribute names start with the 8-byte string "SGI_DMI_".
37 *
38 * In the on-disk inode, DMAPI attribute names consist of the user-provided
39 * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be
40 * changed.
41 */
42
43#define DMATTR_PREFIXLEN 8
44#define DMATTR_PREFIXSTRING "SGI_DMI_"
45
46typedef enum {
47 DM_EVENT_INVALID = -1,
48 DM_EVENT_CANCEL = 0, /* not supported */
49 DM_EVENT_MOUNT = 1,
50 DM_EVENT_PREUNMOUNT = 2,
51 DM_EVENT_UNMOUNT = 3,
52 DM_EVENT_DEBUT = 4, /* not supported */
53 DM_EVENT_CREATE = 5,
54 DM_EVENT_CLOSE = 6, /* not supported */
55 DM_EVENT_POSTCREATE = 7,
56 DM_EVENT_REMOVE = 8,
57 DM_EVENT_POSTREMOVE = 9,
58 DM_EVENT_RENAME = 10,
59 DM_EVENT_POSTRENAME = 11,
60 DM_EVENT_LINK = 12,
61 DM_EVENT_POSTLINK = 13,
62 DM_EVENT_SYMLINK = 14,
63 DM_EVENT_POSTSYMLINK = 15,
64 DM_EVENT_READ = 16,
65 DM_EVENT_WRITE = 17,
66 DM_EVENT_TRUNCATE = 18,
67 DM_EVENT_ATTRIBUTE = 19,
68 DM_EVENT_DESTROY = 20,
69 DM_EVENT_NOSPACE = 21,
70 DM_EVENT_USER = 22,
71 DM_EVENT_MAX = 23
72} dm_eventtype_t;
73#define HAVE_DM_EVENTTYPE_T
74
75typedef enum {
76 DM_RIGHT_NULL,
77 DM_RIGHT_SHARED,
78 DM_RIGHT_EXCL
79} dm_right_t;
80#define HAVE_DM_RIGHT_T
81
82/* Defines for determining if an event message should be sent. */
83#define DM_EVENT_ENABLED(vfsp, ip, event) ( \
84 unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
85 ( ((ip)->i_d.di_dmevmask & (1 << event)) || \
86 ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
87 )
88
89#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \
90 unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
91 ( ((io)->io_dmevmask & (1 << event)) || \
92 ((io)->io_mount->m_dmevmask & (1 << event)) ) \
93 )
94
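/*
 * Illustrative gate (the send helper named here is a placeholder, not
 * an API declared in this header): callers run the cheap mask checks
 * above before paying for any DMAPI event delivery.
 *
 *	error = 0;
 *	if (DM_EVENT_ENABLED(vfsp, ip, DM_EVENT_WRITE))
 *		error = send_dm_data_event(DM_EVENT_WRITE, vp, off, len);
 *	if (error)
 *		return error;
 */
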
95#define DM_XFS_VALID_FS_EVENTS ( \
96 (1 << DM_EVENT_PREUNMOUNT) | \
97 (1 << DM_EVENT_UNMOUNT) | \
98 (1 << DM_EVENT_NOSPACE) | \
99 (1 << DM_EVENT_DEBUT) | \
100 (1 << DM_EVENT_CREATE) | \
101 (1 << DM_EVENT_POSTCREATE) | \
102 (1 << DM_EVENT_REMOVE) | \
103 (1 << DM_EVENT_POSTREMOVE) | \
104 (1 << DM_EVENT_RENAME) | \
105 (1 << DM_EVENT_POSTRENAME) | \
106 (1 << DM_EVENT_LINK) | \
107 (1 << DM_EVENT_POSTLINK) | \
108 (1 << DM_EVENT_SYMLINK) | \
109 (1 << DM_EVENT_POSTSYMLINK) | \
110 (1 << DM_EVENT_ATTRIBUTE) | \
111 (1 << DM_EVENT_DESTROY) )
112
113/* Events valid in dm_set_eventlist() when called with a file handle for
114 a regular file or a symlink. These events are persistent.
115*/
116
117#define DM_XFS_VALID_FILE_EVENTS ( \
118 (1 << DM_EVENT_ATTRIBUTE) | \
119 (1 << DM_EVENT_DESTROY) )
120
121/* Events valid in dm_set_eventlist() when called with a file handle for
122 a directory. These events are persistent.
123*/
124
125#define DM_XFS_VALID_DIRECTORY_EVENTS ( \
126 (1 << DM_EVENT_CREATE) | \
127 (1 << DM_EVENT_POSTCREATE) | \
128 (1 << DM_EVENT_REMOVE) | \
129 (1 << DM_EVENT_POSTREMOVE) | \
130 (1 << DM_EVENT_RENAME) | \
131 (1 << DM_EVENT_POSTRENAME) | \
132 (1 << DM_EVENT_LINK) | \
133 (1 << DM_EVENT_POSTLINK) | \
134 (1 << DM_EVENT_SYMLINK) | \
135 (1 << DM_EVENT_POSTSYMLINK) | \
136 (1 << DM_EVENT_ATTRIBUTE) | \
137 (1 << DM_EVENT_DESTROY) )
138
139/* Events supported by the XFS filesystem. */
140#define DM_XFS_SUPPORTED_EVENTS ( \
141 (1 << DM_EVENT_MOUNT) | \
142 (1 << DM_EVENT_PREUNMOUNT) | \
143 (1 << DM_EVENT_UNMOUNT) | \
144 (1 << DM_EVENT_NOSPACE) | \
145 (1 << DM_EVENT_CREATE) | \
146 (1 << DM_EVENT_POSTCREATE) | \
147 (1 << DM_EVENT_REMOVE) | \
148 (1 << DM_EVENT_POSTREMOVE) | \
149 (1 << DM_EVENT_RENAME) | \
150 (1 << DM_EVENT_POSTRENAME) | \
151 (1 << DM_EVENT_LINK) | \
152 (1 << DM_EVENT_POSTLINK) | \
153 (1 << DM_EVENT_SYMLINK) | \
154 (1 << DM_EVENT_POSTSYMLINK) | \
155 (1 << DM_EVENT_READ) | \
156 (1 << DM_EVENT_WRITE) | \
157 (1 << DM_EVENT_TRUNCATE) | \
158 (1 << DM_EVENT_ATTRIBUTE) | \
159 (1 << DM_EVENT_DESTROY) )
160
161
162/*
163 * Definitions used for the flags field on dm_send_*_event().
164 */
165
166#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */
167#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */
168#define DM_FLAGS_ISEM 0x004 /* thread holds i_sem */
169#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
170#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,21)
171/* i_alloc_sem was added in 2.4.22-pre1 */
172#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */
173#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */
174#endif
175#endif
176
177/*
178 * Based on IO_ISDIRECT, decide which i_ flag is set.
179 */
180#ifdef DM_FLAGS_IALLOCSEM_RD
181#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
182 DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM)
183#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM)
184#else
185#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
186 0 : DM_FLAGS_ISEM)
187#define DM_SEM_FLAG_WR (DM_FLAGS_ISEM)
188#endif
189
190/*
191 * Macros to turn caller specified delay/block flags into
192 * dm_send_xxxx_event flag DM_FLAGS_NDELAY.
193 */
194
195#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
196 DM_FLAGS_NDELAY : 0)
197#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
198
199
200extern struct bhv_vfsops xfs_dmops;
201
202#ifdef CONFIG_XFS_DMAPI
203void xfs_dm_init(struct file_system_type *);
204void xfs_dm_exit(struct file_system_type *);
205#define XFS_DM_INIT(fstype) xfs_dm_init(fstype)
206#define XFS_DM_EXIT(fstype) xfs_dm_exit(fstype)
207#else
208#define XFS_DM_INIT(fstype)
209#define XFS_DM_EXIT(fstype)
210#endif
211
212#endif /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
new file mode 100644
index 000000000000..cec54ba800eb
--- /dev/null
+++ b/fs/xfs/xfs_dmops.c
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#include "xfs.h"
33
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45
46xfs_dmops_t xfs_dmcore_stub = {
47 .xfs_send_data = (xfs_send_data_t)fs_nosys,
48 .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr,
49 .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys,
50 .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys,
51 .xfs_send_unmount = (xfs_send_unmount_t)fs_noval,
52};
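
The vector above is a null-object stub: with CONFIG_XFS_DMAPI disabled, every DMAPI hook still resolves to a valid function (fs_nosys returning ENOSYS, fs_noerr returning 0, fs_noval returning nothing), so callers can dispatch through the ops table unconditionally rather than testing for NULL. A self-contained sketch of the same pattern, with invented demo_* names for illustration:

	struct demo_ops {
		int (*send_data)(void *arg);
	};

	static int demo_nosys(void *arg)
	{
		return 38;			/* stand-in for ENOSYS */
	}

	static struct demo_ops demo_stub = {
		.send_data = demo_nosys,	/* feature compiled out */
	};

	static int demo_dispatch(struct demo_ops *ops, void *arg)
	{
		return ops->send_data(arg);	/* no NULL check needed */
	}
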
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
new file mode 100644
index 000000000000..bbe1dea11c08
--- /dev/null
+++ b/fs/xfs/xfs_error.c
@@ -0,0 +1,327 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_sb.h"
40#include "xfs_trans.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_bmap_btree.h"
46#include "xfs_attr_sf.h"
47#include "xfs_dir_sf.h"
48#include "xfs_dir2_sf.h"
49#include "xfs_dinode.h"
50#include "xfs_inode.h"
51#include "xfs_utils.h"
52#include "xfs_error.h"
53
54#ifdef DEBUG
55
56int xfs_etrap[XFS_ERROR_NTRAP] = {
57 0,
58};
59
60int
61xfs_error_trap(int e)
62{
63 int i;
64
65 if (!e)
66 return 0;
67 for (i = 0; i < XFS_ERROR_NTRAP; i++) {
68 if (xfs_etrap[i] == 0)
69 break;
70 if (e != xfs_etrap[i])
71 continue;
72 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e);
73 debug_stop_all_cpus((void *)-1LL);
74 BUG();
75 break;
76 }
77 return e;
78}
79#endif
80
81#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
82
83int xfs_etest[XFS_NUM_INJECT_ERROR];
84int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
85char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
86
87void
88xfs_error_test_init(void)
89{
90 memset(xfs_etest, 0, sizeof(xfs_etest));
91 memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid));
92 memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname));
93}
94
95int
96xfs_error_test(int error_tag, int *fsidp, char *expression,
97 int line, char *file, unsigned long randfactor)
98{
99 int i;
100 int64_t fsid;
101
102 if (random() % randfactor)
103 return 0;
104
105 memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
106
107 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
108 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
109 cmn_err(CE_WARN,
110 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
111 expression, file, line, xfs_etest_fsname[i]);
112 return 1;
113 }
114 }
115
116 return 0;
117}
118
119int
120xfs_errortag_add(int error_tag, xfs_mount_t *mp)
121{
122 int i;
123 int len;
124 int64_t fsid;
125
126 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
127
128 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
129 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
130			cmn_err(CE_WARN, "XFS error tag #%d already on", error_tag);
131 return 0;
132 }
133 }
134
135 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
136 if (xfs_etest[i] == 0) {
137 cmn_err(CE_WARN, "Turned on XFS error tag #%d",
138 error_tag);
139 xfs_etest[i] = error_tag;
140 xfs_etest_fsid[i] = fsid;
141 len = strlen(mp->m_fsname);
142 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
143 strcpy(xfs_etest_fsname[i], mp->m_fsname);
144 return 0;
145 }
146 }
147
148 cmn_err(CE_WARN, "error tag overflow, too many turned on");
149
150 return 1;
151}
152
153int
154xfs_errortag_clear(int error_tag, xfs_mount_t *mp)
155{
156 int i;
157 int64_t fsid;
158
159 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
160
161 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
162 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
163 xfs_etest[i] = 0;
164 xfs_etest_fsid[i] = 0LL;
165 kmem_free(xfs_etest_fsname[i],
166 strlen(xfs_etest_fsname[i]) + 1);
167 xfs_etest_fsname[i] = NULL;
168 cmn_err(CE_WARN, "Cleared XFS error tag #%d",
169 error_tag);
170 return 0;
171 }
172 }
173
174 cmn_err(CE_WARN, "XFS error tag %d not on", error_tag);
175
176 return 1;
177}
178
179int
180xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud)
181{
182 int i;
183 int cleared = 0;
184
185 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
186 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
187 xfs_etest[i] != 0) {
188 cleared = 1;
189 cmn_err(CE_WARN, "Clearing XFS error tag #%d",
190 xfs_etest[i]);
191 xfs_etest[i] = 0;
192 xfs_etest_fsid[i] = 0LL;
193 kmem_free(xfs_etest_fsname[i],
194 strlen(xfs_etest_fsname[i]) + 1);
195 xfs_etest_fsname[i] = NULL;
196 }
197 }
198
199 if (loud || cleared)
200 cmn_err(CE_WARN,
201 "Cleared all XFS error tags for filesystem \"%s\"",
202 fsname);
203
204 return 0;
205}
206
207int
208xfs_errortag_clearall(xfs_mount_t *mp)
209{
210 int64_t fsid;
211
212 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
213
214 return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1);
215}
216#endif /* DEBUG || INDUCE_IO_ERROR */
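
Putting the pieces together: xfs_errortag_add() registers a (tag, fsid) pair, after which xfs_error_test() fires on roughly one in randfactor matching calls. Call sites reach it through the XFS_TEST_ERROR() macro declared in xfs_error.h, so an ordinary corruption check doubles as an injection point. A sketch in the spirit of the real call sites; the magic-number check shown is illustrative, not a quote of this patch:

	/* Illustrative check: forces the failure path on a DEBUG kernel. */
	if (XFS_TEST_ERROR(agf->agf_magicnum != XFS_AGF_MAGIC, mp,
			   XFS_ERRTAG_ALLOC_READ_AGF,
			   XFS_RANDOM_ALLOC_READ_AGF)) {
		XFS_ERROR_REPORT("example check", XFS_ERRLEVEL_LOW, mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
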
217
218static void
219xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
220{
221 if (mp != NULL) {
222 char *newfmt;
223 int len = 16 + mp->m_fsname_len + strlen(fmt);
224
225 newfmt = kmem_alloc(len, KM_SLEEP);
226 sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
227 icmn_err(level, newfmt, ap);
228 kmem_free(newfmt, len);
229 } else {
230 icmn_err(level, fmt, ap);
231 }
232}
233
234void
235xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
236{
237 va_list ap;
238
239 va_start(ap, fmt);
240 xfs_fs_vcmn_err(level, mp, fmt, ap);
241 va_end(ap);
242}
243
244void
245xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
246{
247 va_list ap;
248
249#ifdef DEBUG
250 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
251#endif
252
253 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
254 && (level & CE_ALERT)) {
255 level &= ~CE_ALERT;
256 level |= CE_PANIC;
257 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
258 }
259 va_start(ap, fmt);
260 xfs_fs_vcmn_err(level, mp, fmt, ap);
261 va_end(ap);
262}
263
264void
265xfs_error_report(
266 char *tag,
267 int level,
268 xfs_mount_t *mp,
269 char *fname,
270 int linenum,
271 inst_t *ra)
272{
273 if (level <= xfs_error_level) {
274 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
275 CE_ALERT, mp,
276 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
277 tag, linenum, fname, ra);
278
279 xfs_stack_trace();
280 }
281}
282
283void
284xfs_hex_dump(void *p, int length)
285{
286 __uint8_t *uip = (__uint8_t*)p;
287 int i;
288 char sbuf[128], *s;
289
290 s = sbuf;
291 *s = '\0';
292 for (i=0; i<length; i++, uip++) {
293 if ((i % 16) == 0) {
294 if (*s != '\0')
295 cmn_err(CE_ALERT, "%s\n", sbuf);
296 s = sbuf;
297 sprintf(s, "0x%x: ", i);
298 while( *s != '\0')
299 s++;
300 }
301 sprintf(s, "%02x ", *uip);
302
303 /*
304			 * the kernel sprintf returns void; the user-space sprintf
305			 * returns the length of the formatted string. Either way,
306			 * find the new end of string by scanning.
307 */
308 while( *s != '\0')
309 s++;
310 }
311 cmn_err(CE_ALERT, "%s\n", sbuf);
312}
313
314void
315xfs_corruption_error(
316 char *tag,
317 int level,
318 xfs_mount_t *mp,
319 void *p,
320 char *fname,
321 int linenum,
322 inst_t *ra)
323{
324 if (level <= xfs_error_level)
325 xfs_hex_dump(p, 16);
326 xfs_error_report(tag, level, mp, fname, linenum, ra);
327}
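
Taken together, xfs_corruption_error() hex-dumps the first 16 bytes of the suspect object (xfs_hex_dump() emits one alert line per 16 bytes, each prefixed with its offset) and then reports through xfs_error_report(), whose XFS_PTAG_ERROR_REPORT tag xfs_cmn_err() can escalate to a panic. A minimal caller sketch using the wrapper macro from xfs_error.h; bad_magic and dip are invented names standing in for a real validity check and the object being checked:

	/* Illustrative: report an on-disk structure that failed validation. */
	if (bad_magic) {
		XFS_CORRUPTION_ERROR("xfs_example_verify",
				     XFS_ERRLEVEL_LOW, mp, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}
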
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
new file mode 100644
index 000000000000..6bc0535c0a65
--- /dev/null
+++ b/fs/xfs/xfs_error.h
@@ -0,0 +1,196 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ERROR_H__
33#define __XFS_ERROR_H__
34
35#define prdev(fmt,targ,args...) \
36 printk("XFS: device %s- " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
37
38#define XFS_ERECOVER 1 /* Failure to recover log */
39#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */
40#define XFS_ENOLOGSPACE 3 /* Reservation too large */
41#define XFS_ENOTSUP 4 /* Operation not supported */
42#define XFS_ENOLSN 5 /* Can't find the lsn you asked for */
43#define XFS_ENOTFOUND 6
44#define XFS_ENOTXFS 7 /* Not XFS filesystem */
45
46#ifdef DEBUG
47#define XFS_ERROR_NTRAP 10
48extern int xfs_etrap[XFS_ERROR_NTRAP];
49extern int xfs_error_trap(int);
50#define XFS_ERROR(e) xfs_error_trap(e)
51#else
52#define XFS_ERROR(e) (e)
53#endif
54
55struct xfs_mount;
56
57extern void
58xfs_error_report(
59 char *tag,
60 int level,
61 struct xfs_mount *mp,
62 char *fname,
63 int linenum,
64 inst_t *ra);
65
66extern void
67xfs_corruption_error(
68 char *tag,
69 int level,
70 struct xfs_mount *mp,
71 void *p,
72 char *fname,
73 int linenum,
74 inst_t *ra);
75
76extern void
77xfs_hex_dump(void *p, int length);
78
79#define XFS_ERROR_REPORT(e, lvl, mp) \
80 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
81#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \
82 xfs_corruption_error(e, lvl, mp, mem, \
83 __FILE__, __LINE__, __return_address)
84
85#define XFS_ERRLEVEL_OFF 0
86#define XFS_ERRLEVEL_LOW 1
87#define XFS_ERRLEVEL_HIGH 5
88
89/*
90 * Error injection tags - the labels can be anything you want,
91 * but each tag should have its own unique number.
92 */
93
94#define XFS_ERRTAG_NOERROR 0
95#define XFS_ERRTAG_IFLUSH_1 1
96#define XFS_ERRTAG_IFLUSH_2 2
97#define XFS_ERRTAG_IFLUSH_3 3
98#define XFS_ERRTAG_IFLUSH_4 4
99#define XFS_ERRTAG_IFLUSH_5 5
100#define XFS_ERRTAG_IFLUSH_6 6
101#define XFS_ERRTAG_DA_READ_BUF 7
102#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
103#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
104#define XFS_ERRTAG_ALLOC_READ_AGF 10
105#define XFS_ERRTAG_IALLOC_READ_AGI 11
106#define XFS_ERRTAG_ITOBP_INOTOBP 12
107#define XFS_ERRTAG_IUNLINK 13
108#define XFS_ERRTAG_IUNLINK_REMOVE 14
109#define XFS_ERRTAG_DIR_INO_VALIDATE 15
110#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
111#define XFS_ERRTAG_IODONE_IOERR 17
112#define XFS_ERRTAG_STRATREAD_IOERR 18
113#define XFS_ERRTAG_STRATCMPL_IOERR 19
114#define XFS_ERRTAG_DIOWRITE_IOERR 20
115#define XFS_ERRTAG_BMAPIFORMAT 21
116#define XFS_ERRTAG_MAX 22
117
118/*
119 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
120 */
121#define XFS_RANDOM_DEFAULT 100
122#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
123#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
124#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
125#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
126#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
127#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
128#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
129#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
130#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
131#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
132#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
133#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
134#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
135#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
136#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
137#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
138#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
139#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
140#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
141#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
142#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
143
144#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
145extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
146void xfs_error_test_init(void);
147
148#define XFS_NUM_INJECT_ERROR 10
149
150#ifdef __ANSI_CPP__
151#define XFS_TEST_ERROR(expr, mp, tag, rf) \
152 ((expr) || \
153 xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
154 (rf)))
155#else
156#define XFS_TEST_ERROR(expr, mp, tag, rf) \
157 ((expr) || \
158 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
159 (rf)))
160#endif /* __ANSI_CPP__ */
161
162int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
163int xfs_errortag_clear(int error_tag, xfs_mount_t *mp);
164
165int xfs_errortag_clearall(xfs_mount_t *mp);
166int xfs_errortag_clearall_umount(int64_t fsid, char *fsname,
167 int loud);
168#else
169#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
170#define xfs_errortag_add(tag, mp) (ENOSYS)
171#define xfs_errortag_clearall(mp) (ENOSYS)
172#endif /* (DEBUG || INDUCE_IO_ERROR) */
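
The __ANSI_CPP__ split above only affects the message text: an ANSI preprocessor stringizes #expr into the literal source of the failed condition, while the fallback logs the constant string "expr". A standalone demonstration of the stringizing behaviour:

	#include <stdio.h>

	#define SHOW(expr)	printf("%s -> %d\n", #expr, (expr))

	int main(void)
	{
		int x = 3;

		SHOW(x > 2);	/* prints: x > 2 -> 1 */
		return 0;
	}
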
173
174/*
175 * XFS panic tags -- allow a call to xfs_cmn_err() to be
176 * turned into a panic by setting xfs_panic_mask via a
177 * sysctl. Update xfs_max[XFS_PARAM] if
178 * more are added.
179 */
180#define XFS_NO_PTAG 0
181#define XFS_PTAG_IFLUSH 0x00000001
182#define XFS_PTAG_LOGRES 0x00000002
183#define XFS_PTAG_AILDELETE 0x00000004
184#define XFS_PTAG_ERROR_REPORT 0x00000008
185#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
186#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
187#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
188
189struct xfs_mount;
190/* PRINTFLIKE4 */
191void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
192 char *fmt, ...);
193/* PRINTFLIKE3 */
194void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
195
196#endif /* __XFS_ERROR_H__ */
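
Since the panic tags are plain bit flags, they can be OR-ed together into xfs_panic_mask (exposed through the XFS panic_mask sysctl) to turn selected alerts into panics. For instance, to panic on corruption- or log-error-triggered shutdowns, one would set (values taken from the definitions above):

	/* 0x10 | 0x40 == 0x50, i.e. decimal 80 */
	xfs_panic_mask = XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_SHUTDOWN_LOGERROR;
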
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
new file mode 100644
index 000000000000..5eafd5b63211
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains the implementation of the xfs_efi_log_item
35 * and xfs_efd_log_item items.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_buf_item.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dmapi.h"
49#include "xfs_mount.h"
50#include "xfs_trans_priv.h"
51#include "xfs_extfree_item.h"
52
53
54kmem_zone_t *xfs_efi_zone;
55kmem_zone_t *xfs_efd_zone;
56
57STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *);
58STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *);
59STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *);
60
61
62
63/*
64 * This returns the number of iovecs needed to log the given efi item.
65 * We only need 1 iovec for an efi item. It just logs the efi_log_format
66 * structure.
67 */
68/*ARGSUSED*/
69STATIC uint
70xfs_efi_item_size(xfs_efi_log_item_t *efip)
71{
72 return 1;
73}
74
75/*
76 * This is called to fill in the vector of log iovecs for the
77 * given efi log item. We use only 1 iovec, and we point that
78 * at the efi_log_format structure embedded in the efi item.
79 * It is at this point that we assert that all of the extent
80 * slots in the efi item have been filled.
81 */
82STATIC void
83xfs_efi_item_format(xfs_efi_log_item_t *efip,
84 xfs_log_iovec_t *log_vector)
85{
86 uint size;
87
88 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
89
90 efip->efi_format.efi_type = XFS_LI_EFI;
91
92 size = sizeof(xfs_efi_log_format_t);
93 size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
94 efip->efi_format.efi_size = 1;
95
96 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
97 log_vector->i_len = size;
98 ASSERT(size >= sizeof(xfs_efi_log_format_t));
99}
100
101
102/*
103 * Pinning has no meaning for an efi item, so just return.
104 */
105/*ARGSUSED*/
106STATIC void
107xfs_efi_item_pin(xfs_efi_log_item_t *efip)
108{
109 return;
110}
111
112
113/*
114 * While EFIs cannot really be pinned, the unpin operation is the
115 * last place at which the EFI is manipulated during a transaction.
116 * Here we coordinate with xfs_efi_cancel() to determine who gets to
117 * free the EFI.
118 */
119/*ARGSUSED*/
120STATIC void
121xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
122{
123 int nexts;
124 int size;
125 xfs_mount_t *mp;
126 SPLDECL(s);
127
128 mp = efip->efi_item.li_mountp;
129 AIL_LOCK(mp, s);
130 if (efip->efi_flags & XFS_EFI_CANCELED) {
131 /*
132 * xfs_trans_delete_ail() drops the AIL lock.
133 */
134 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
135
136 nexts = efip->efi_format.efi_nextents;
137 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
138 size = sizeof(xfs_efi_log_item_t);
139 size += (nexts - 1) * sizeof(xfs_extent_t);
140 kmem_free(efip, size);
141 } else {
142 kmem_zone_free(xfs_efi_zone, efip);
143 }
144 } else {
145 efip->efi_flags |= XFS_EFI_COMMITTED;
146 AIL_UNLOCK(mp, s);
147 }
148
149 return;
150}
151
152/*
153 * Like unpin, except we must also clear the transaction descriptor
154 * pointing to the log item if we free the item. This routine duplicates
155 * unpin because efi_flags is protected by the AIL lock. Freeing
156 * the descriptor and then calling unpin would force us to drop the AIL
157 * lock, which would open up a race condition.
158 */
159STATIC void
160xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
161{
162 int nexts;
163 int size;
164 xfs_mount_t *mp;
165 xfs_log_item_desc_t *lidp;
166 SPLDECL(s);
167
168 mp = efip->efi_item.li_mountp;
169 AIL_LOCK(mp, s);
170 if (efip->efi_flags & XFS_EFI_CANCELED) {
171 /*
172 * free the xaction descriptor pointing to this item
173 */
174 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
175 xfs_trans_free_item(tp, lidp);
176 /*
177 * pull the item off the AIL.
178 * xfs_trans_delete_ail() drops the AIL lock.
179 */
180 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
181 /*
182 * now free the item itself
183 */
184 nexts = efip->efi_format.efi_nextents;
185 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
186 size = sizeof(xfs_efi_log_item_t);
187 size += (nexts - 1) * sizeof(xfs_extent_t);
188 kmem_free(efip, size);
189 } else {
190 kmem_zone_free(xfs_efi_zone, efip);
191 }
192 } else {
193 efip->efi_flags |= XFS_EFI_COMMITTED;
194 AIL_UNLOCK(mp, s);
195 }
196
197 return;
198}
199
200/*
201 * Efi items have no locking or pushing. However, since EFIs are
202 * pulled from the AIL when their corresponding EFDs are committed
203 * to disk, their situation is very similar to being pinned. Return
204 * XFS_ITEM_PINNED so that the caller will eventually flush the log.
205 * This should help in getting the EFI out of the AIL.
206 */
207/*ARGSUSED*/
208STATIC uint
209xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
210{
211 return XFS_ITEM_PINNED;
212}
213
214/*
215 * Efi items have no locking; on unlock, free the item if the transaction was aborted.
216 */
217/*ARGSUSED*/
218STATIC void
219xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
220{
221 if (efip->efi_item.li_flags & XFS_LI_ABORTED)
222 xfs_efi_item_abort(efip);
223 return;
224}
225
226/*
227 * The EFI is logged only once and cannot be moved in the log, so
228 * simply return the lsn at which it's been logged. The canceled
229 * flag is not paid any attention here. Checking for that is delayed
230 * until the EFI is unpinned.
231 */
232/*ARGSUSED*/
233STATIC xfs_lsn_t
234xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
235{
236 return lsn;
237}
238
239/*
240 * This is called when the transaction logging the EFI is aborted.
241 * Free up the EFI and return. No need to clean up the slot for
242 * the item in the transaction. That was done by the unpin code
243 * which is called prior to this routine in the abort/fs-shutdown path.
244 */
245STATIC void
246xfs_efi_item_abort(xfs_efi_log_item_t *efip)
247{
248 int nexts;
249 int size;
250
251 nexts = efip->efi_format.efi_nextents;
252 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
253 size = sizeof(xfs_efi_log_item_t);
254 size += (nexts - 1) * sizeof(xfs_extent_t);
255 kmem_free(efip, size);
256 } else {
257 kmem_zone_free(xfs_efi_zone, efip);
258 }
259 return;
260}
261
262/*
263 * There isn't much you can do to push on an efi item. It is simply
264 * stuck waiting for all of its corresponding efd items to be
265 * committed to disk.
266 */
267/*ARGSUSED*/
268STATIC void
269xfs_efi_item_push(xfs_efi_log_item_t *efip)
270{
271 return;
272}
273
274/*
275 * The EFI dependency tracking op doesn't do squat. It can't because
276 * it doesn't know where the free extent is coming from. The dependency
277 * tracking has to be handled by the "enclosing" metadata object. For
278 * example, for inodes, the inode is locked throughout the extent freeing
279 * so the dependency should be recorded there.
280 */
281/*ARGSUSED*/
282STATIC void
283xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
284{
285 return;
286}
287
288/*
289 * This is the ops vector shared by all efi log items.
290 */
291struct xfs_item_ops xfs_efi_item_ops = {
292 .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
293 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
294 xfs_efi_item_format,
295 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
296 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin,
297 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
298 xfs_efi_item_unpin_remove,
299 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
300 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
301 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
302 xfs_efi_item_committed,
303 .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push,
304 .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort,
305 .iop_pushbuf = NULL,
306 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
307 xfs_efi_item_committing
308};
309
310
311/*
312 * Allocate and initialize an efi item with the given number of extents.
313 */
314xfs_efi_log_item_t *
315xfs_efi_init(xfs_mount_t *mp,
316 uint nextents)
317
318{
319 xfs_efi_log_item_t *efip;
320 uint size;
321
322 ASSERT(nextents > 0);
323 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
324 size = (uint)(sizeof(xfs_efi_log_item_t) +
325 ((nextents - 1) * sizeof(xfs_extent_t)));
326 efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
327 } else {
328 efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
329 KM_SLEEP);
330 }
331
332 efip->efi_item.li_type = XFS_LI_EFI;
333 efip->efi_item.li_ops = &xfs_efi_item_ops;
334 efip->efi_item.li_mountp = mp;
335 efip->efi_format.efi_nextents = nextents;
336 efip->efi_format.efi_id = (__psint_t)(void*)efip;
337
338 return (efip);
339}
340
341/*
342 * This is called by the efd item code below to release references to
343 * the given efi item. Each efd calls this with the number of
344 * extents that it has logged, and when the sum of these reaches
345 * the total number of extents logged by this efi item we can free
346 * the efi item.
347 *
348 * Freeing the efi item requires that we remove it from the AIL.
349 * We'll use the AIL lock to protect our counters as well as
350 * the removal from the AIL.
351 */
352void
353xfs_efi_release(xfs_efi_log_item_t *efip,
354 uint nextents)
355{
356 xfs_mount_t *mp;
357 int extents_left;
358 uint size;
359 int nexts;
360 SPLDECL(s);
361
362 mp = efip->efi_item.li_mountp;
363 ASSERT(efip->efi_next_extent > 0);
364 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
365
366 AIL_LOCK(mp, s);
367 ASSERT(efip->efi_next_extent >= nextents);
368 efip->efi_next_extent -= nextents;
369 extents_left = efip->efi_next_extent;
370 if (extents_left == 0) {
371 /*
372 * xfs_trans_delete_ail() drops the AIL lock.
373 */
374 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
375 } else {
376 AIL_UNLOCK(mp, s);
377 }
378
379 if (extents_left == 0) {
380 nexts = efip->efi_format.efi_nextents;
381 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
382 size = sizeof(xfs_efi_log_item_t);
383 size += (nexts - 1) * sizeof(xfs_extent_t);
384 kmem_free(efip, size);
385 } else {
386 kmem_zone_free(xfs_efi_zone, efip);
387 }
388 }
389}
390
391/*
392 * This is called when the transaction that should be committing the
393 * EFD corresponding to the given EFI is aborted. The committed and
394 * canceled flags are used to coordinate the freeing of the EFI and
395 * the references by the transaction that committed it.
396 */
397STATIC void
398xfs_efi_cancel(
399 xfs_efi_log_item_t *efip)
400{
401 int nexts;
402 int size;
403 xfs_mount_t *mp;
404 SPLDECL(s);
405
406 mp = efip->efi_item.li_mountp;
407 AIL_LOCK(mp, s);
408 if (efip->efi_flags & XFS_EFI_COMMITTED) {
409 /*
410 * xfs_trans_delete_ail() drops the AIL lock.
411 */
412 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
413
414 nexts = efip->efi_format.efi_nextents;
415 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
416 size = sizeof(xfs_efi_log_item_t);
417 size += (nexts - 1) * sizeof(xfs_extent_t);
418 kmem_free(efip, size);
419 } else {
420 kmem_zone_free(xfs_efi_zone, efip);
421 }
422 } else {
423 efip->efi_flags |= XFS_EFI_CANCELED;
424 AIL_UNLOCK(mp, s);
425 }
426
427 return;
428}
429
430
431
432
433
434/*
435 * This returns the number of iovecs needed to log the given efd item.
436 * We only need 1 iovec for an efd item. It just logs the efd_log_format
437 * structure.
438 */
439/*ARGSUSED*/
440STATIC uint
441xfs_efd_item_size(xfs_efd_log_item_t *efdp)
442{
443 return 1;
444}
445
446/*
447 * This is called to fill in the vector of log iovecs for the
448 * given efd log item. We use only 1 iovec, and we point that
449 * at the efd_log_format structure embedded in the efd item.
450 * It is at this point that we assert that all of the extent
451 * slots in the efd item have been filled.
452 */
453STATIC void
454xfs_efd_item_format(xfs_efd_log_item_t *efdp,
455 xfs_log_iovec_t *log_vector)
456{
457 uint size;
458
459 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
460
461 efdp->efd_format.efd_type = XFS_LI_EFD;
462
463 size = sizeof(xfs_efd_log_format_t);
464 size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
465 efdp->efd_format.efd_size = 1;
466
467 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
468 log_vector->i_len = size;
469 ASSERT(size >= sizeof(xfs_efd_log_format_t));
470}
471
472
473/*
474 * Pinning has no meaning for an efd item, so just return.
475 */
476/*ARGSUSED*/
477STATIC void
478xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
479{
480 return;
481}
482
483
484/*
485 * Since pinning has no meaning for an efd item, neither does
486 * unpinning.
487 */
488/*ARGSUSED*/
489STATIC void
490xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale)
491{
492 return;
493}
494
495/*ARGSUSED*/
496STATIC void
497xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
498{
499 return;
500}
501
502/*
503 * Efd items have no locking; return XFS_ITEM_LOCKED so the caller skips them.
504 */
505/*ARGSUSED*/
506STATIC uint
507xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
508{
509 return XFS_ITEM_LOCKED;
510}
511
512/*
513 * Efd items have no locking; on unlock, free the item if the
514 * transaction was aborted.
515 */
516/*ARGSUSED*/
517STATIC void
518xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
519{
520 if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
521 xfs_efd_item_abort(efdp);
522 return;
523}
524
525/*
526 * When the efd item is committed to disk, all we need to do
527 * is delete our reference to our partner efi item and then
528 * free ourselves. Since we're freeing ourselves we must
529 * return -1 to keep the transaction code from further referencing
530 * this item.
531 */
532/*ARGSUSED*/
533STATIC xfs_lsn_t
534xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
535{
536 uint size;
537 int nexts;
538
539 /*
540 * If we got a log I/O error, it's always the case that the LR with the
541 * EFI got unpinned and freed before the EFD got aborted.
542 */
543 if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
544 xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
545
546 nexts = efdp->efd_format.efd_nextents;
547 if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
548 size = sizeof(xfs_efd_log_item_t);
549 size += (nexts - 1) * sizeof(xfs_extent_t);
550 kmem_free(efdp, size);
551 } else {
552 kmem_zone_free(xfs_efd_zone, efdp);
553 }
554
555 return (xfs_lsn_t)-1;
556}
557
558/*
559 * The transaction of which this EFD is a part has been aborted.
560 * Inform its companion EFI of this fact and then clean up after
561 * ourselves. No need to clean up the slot for the item in the
562 * transaction. That was done by the unpin code which is called
563 * prior to this routine in the abort/fs-shutdown path.
564 */
565STATIC void
566xfs_efd_item_abort(xfs_efd_log_item_t *efdp)
567{
568 int nexts;
569 int size;
570
571 /*
572 * If we got a log I/O error, it's always the case that the LR with the
573 * EFI got unpinned and freed before the EFD got aborted. So don't
574 * reference the EFI at all in that case.
575 */
576 if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
577 xfs_efi_cancel(efdp->efd_efip);
578
579 nexts = efdp->efd_format.efd_nextents;
580 if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
581 size = sizeof(xfs_efd_log_item_t);
582 size += (nexts - 1) * sizeof(xfs_extent_t);
583 kmem_free(efdp, size);
584 } else {
585 kmem_zone_free(xfs_efd_zone, efdp);
586 }
587 return;
588}
589
590/*
591 * There isn't much you can do to push on an efd item. It is simply
592 * stuck waiting for the log to be flushed to disk.
593 */
594/*ARGSUSED*/
595STATIC void
596xfs_efd_item_push(xfs_efd_log_item_t *efdp)
597{
598 return;
599}
600
601/*
602 * The EFD dependency tracking op doesn't do squat. It can't because
603 * it doesn't know where the free extent is coming from. The dependency
604 * tracking has to be handled by the "enclosing" metadata object. For
605 * example, for inodes, the inode is locked throughout the extent freeing
606 * so the dependency should be recorded there.
607 */
608/*ARGSUSED*/
609STATIC void
610xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
611{
612 return;
613}
614
615/*
616 * This is the ops vector shared by all efd log items.
617 */
618struct xfs_item_ops xfs_efd_item_ops = {
619 .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
620 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
621 xfs_efd_item_format,
622 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
623 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin,
624 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
625 xfs_efd_item_unpin_remove,
626 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
627 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
628 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
629 xfs_efd_item_committed,
630 .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push,
631 .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort,
632 .iop_pushbuf = NULL,
633 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
634 xfs_efd_item_committing
635};
636
637
638/*
639 * Allocate and initialize an efd item with the given number of extents.
640 */
641xfs_efd_log_item_t *
642xfs_efd_init(xfs_mount_t *mp,
643 xfs_efi_log_item_t *efip,
644 uint nextents)
645
646{
647 xfs_efd_log_item_t *efdp;
648 uint size;
649
650 ASSERT(nextents > 0);
651 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
652 size = (uint)(sizeof(xfs_efd_log_item_t) +
653 ((nextents - 1) * sizeof(xfs_extent_t)));
654 efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
655 } else {
656 efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
657 KM_SLEEP);
658 }
659
660 efdp->efd_item.li_type = XFS_LI_EFD;
661 efdp->efd_item.li_ops = &xfs_efd_item_ops;
662 efdp->efd_item.li_mountp = mp;
663 efdp->efd_efip = efip;
664 efdp->efd_format.efd_nextents = nextents;
665 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
666
667 return (efdp);
668}
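
The two item types pair across transactions: an EFI records the intent to free extents, and an EFD carrying a matching efd_efi_id records completion; log recovery replays any EFI that lacks its EFD. A minimal sketch of the pairing using the constructors above, with the transaction logging and commit plumbing deliberately elided:

	xfs_efi_log_item_t	*efip;
	xfs_efd_log_item_t	*efdp;

	efip = xfs_efi_init(mp, nextents);	/* intent: extents to free */
	/* ... log efip, record the extents, commit transaction 1 ... */

	efdp = xfs_efd_init(mp, efip, nextents);	/* done: carries efi_id */
	/* ... log efdp, free the extents, commit transaction 2;
	 *     committing the EFD releases the EFI via xfs_efi_release(). */
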
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
new file mode 100644
index 000000000000..7122d6101d15
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.h
@@ -0,0 +1,123 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_EXTFREE_ITEM_H__
33#define __XFS_EXTFREE_ITEM_H__
34
35struct xfs_mount;
36struct kmem_zone;
37
38typedef struct xfs_extent {
39 xfs_dfsbno_t ext_start;
40 xfs_extlen_t ext_len;
41} xfs_extent_t;
42
43/*
44 * This is the structure used to lay out an efi log item in the
45 * log. The efi_extents field is a variable size array whose
46 * size is given by efi_nextents.
47 */
48typedef struct xfs_efi_log_format {
49 unsigned short efi_type; /* efi log item type */
50 unsigned short efi_size; /* size of this item */
51 uint efi_nextents; /* # extents to free */
52 __uint64_t efi_id; /* efi identifier */
53 xfs_extent_t efi_extents[1]; /* array of extents to free */
54} xfs_efi_log_format_t;
55
56/*
57 * This is the structure used to lay out an efd log item in the
58 * log. The efd_extents array is a variable size array whose
59 * size is given by efd_nextents.
60 */
61typedef struct xfs_efd_log_format {
62 unsigned short efd_type; /* efd log item type */
63 unsigned short efd_size; /* size of this item */
64 uint efd_nextents; /* # of extents freed */
65 __uint64_t efd_efi_id; /* id of corresponding efi */
66 xfs_extent_t efd_extents[1]; /* array of extents freed */
67} xfs_efd_log_format_t;
68
69
70#ifdef __KERNEL__
71
72/*
73 * Max number of extents in fast allocation path.
74 */
75#define XFS_EFI_MAX_FAST_EXTENTS 16
76
77/*
78 * Define EFI flags.
79 */
80#define XFS_EFI_RECOVERED 0x1
81#define XFS_EFI_COMMITTED 0x2
82#define XFS_EFI_CANCELED 0x4
83
84/*
85 * This is the "extent free intention" log item. It is used
86 * to log the fact that some extents need to be freed. It is
87 * used in conjunction with the "extent free done" log item
88 * described below.
89 */
90typedef struct xfs_efi_log_item {
91 xfs_log_item_t efi_item;
92 uint efi_flags; /* misc flags */
93 uint efi_next_extent;
94 xfs_efi_log_format_t efi_format;
95} xfs_efi_log_item_t;
96
97/*
98 * This is the "extent free done" log item. It is used to log
99 * the fact that some extents earlier mentioned in an efi item
100 * have been freed.
101 */
102typedef struct xfs_efd_log_item {
103 xfs_log_item_t efd_item;
104 xfs_efi_log_item_t *efd_efip;
105 uint efd_next_extent;
106 xfs_efd_log_format_t efd_format;
107} xfs_efd_log_item_t;
108
109/*
110 * Max number of extents in fast allocation path.
111 */
112#define XFS_EFD_MAX_FAST_EXTENTS 16
113
114extern struct kmem_zone *xfs_efi_zone;
115extern struct kmem_zone *xfs_efd_zone;
116
117xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
118xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
119 uint);
120
121#endif /* __KERNEL__ */
122
123#endif /* __XFS_EXTFREE_ITEM_H__ */
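
Both log formats use the pre-C99 variable-length-array idiom: the struct declares a one-element extent array and the allocator adds room for the remaining nextents - 1 entries, which is exactly the sizing formula repeated throughout xfs_extfree_item.c. A self-contained demonstration of the arithmetic, with invented demo_* names (nextents is assumed to be at least 1, as the ASSERTs in the source enforce):

	#include <stdlib.h>

	struct demo_efi {
		unsigned int	nextents;
		struct { unsigned long long start; unsigned int len; } ext[1];
	};

	static struct demo_efi *demo_efi_alloc(unsigned int nextents)
	{
		/* ext[0] is already counted inside sizeof(struct demo_efi). */
		size_t size = sizeof(struct demo_efi) +
			      (nextents - 1) * sizeof(((struct demo_efi *)0)->ext[0]);

		return calloc(1, size);
	}
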
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
new file mode 100644
index 000000000000..6ee8443bf9d3
--- /dev/null
+++ b/fs/xfs/xfs_fs.h
@@ -0,0 +1,527 @@
1/*
2 * Copyright (c) 1995-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2.1 of the GNU Lesser General Public License
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this program; if not, write the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307,
22 * USA.
23 *
24 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
25 * Mountain View, CA 94043, or:
26 *
27 * http://www.sgi.com
28 *
29 * For further information regarding this notice, see:
30 *
31 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
32 */
33#ifndef __XFS_FS_H__
34#define __XFS_FS_H__
35
36/*
37 * SGI's XFS filesystem: user-visible constants and structures.
38 */
39
40#define XFS_NAME "xfs"
41
42/*
43 * Direct I/O attribute record, used with XFS_IOC_DIOINFO.
44 * d_miniosz is the minimum transfer size, the transfer-size multiple,
45 * and the file seek offset alignment.
46 */
47#ifndef HAVE_DIOATTR
48struct dioattr {
49 __u32 d_mem; /* data buffer memory alignment */
50 __u32 d_miniosz; /* min xfer size */
51 __u32 d_maxiosz; /* max xfer size */
52};
53#endif
54
55/*
56 * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
57 */
58#ifndef HAVE_FSXATTR
59struct fsxattr {
60 __u32 fsx_xflags; /* xflags field value (get/set) */
61 __u32 fsx_extsize; /* extsize field value (get/set)*/
62 __u32 fsx_nextents; /* nextents field value (get) */
63 unsigned char fsx_pad[16];
64};
65#endif
66
67/*
68 * Flags for the bs_xflags/fsx_xflags field
69 * There should be a one-to-one correspondence between these flags and the
70 * XFS_DIFLAG_s.
71 */
72#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
73#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
74#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
75#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */
76#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
77#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */
78#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
79#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
80#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
81#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
82#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
83
84/*
85 * Structure for XFS_IOC_GETBMAP.
86 * On input, fill in bmv_offset and bmv_length of the first structure
87 * to indicate the area of interest in the file, and bmv_count with the
88 * number of array elements given. The first structure is updated on
89 * return to give the offset and length for the next call.
90 */
91#ifndef HAVE_GETBMAP
92struct getbmap {
93 __s64 bmv_offset; /* file offset of segment in blocks */
94 __s64 bmv_block; /* starting block (64-bit daddr_t) */
95 __s64 bmv_length; /* length of segment, blocks */
96 __s32 bmv_count; /* # of entries in array incl. 1st */
97 __s32 bmv_entries; /* # of entries filled in (output) */
98};
99#endif
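
In use, the caller primes the first (header) element and passes the whole array; on return, bmv_entries says how many segments follow the header, and the header has been advanced for the next call. A hedged user-space sketch with error handling trimmed; it assumes the XFS ioctl definitions are in scope, and relies on the convention that a bmv_length of -1 means "to end of file":

	#include <string.h>
	#include <sys/ioctl.h>

	static int demo_getbmap(int fd)
	{
		struct getbmap	bmv[17];	/* header + up to 16 segments */

		memset(bmv, 0, sizeof(bmv));
		bmv[0].bmv_length = -1;		/* from offset 0 to end of file */
		bmv[0].bmv_count = 17;		/* array size, header included */

		if (ioctl(fd, XFS_IOC_GETBMAP, bmv) < 0)
			return -1;

		return bmv[0].bmv_entries;	/* segments are in bmv[1..] */
	}
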
100
101/*
102 * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries
103 * are used exactly as in the getbmap structure. The getbmapx structure
104 * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
105 * is only used for the first structure. It contains input flags
106 * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled
107 * in by the XFS_IOC_GETBMAPX command for each returned structure after
108 * the first.
109 */
110#ifndef HAVE_GETBMAPX
111struct getbmapx {
112 __s64 bmv_offset; /* file offset of segment in blocks */
113 __s64 bmv_block; /* starting block (64-bit daddr_t) */
114 __s64 bmv_length; /* length of segment, blocks */
115 __s32 bmv_count; /* # of entries in array incl. 1st */
116 __s32 bmv_entries; /* # of entries filled in (output). */
117 __s32 bmv_iflags; /* input flags (1st structure) */
118 __s32 bmv_oflags; /* output flags (after 1st structure)*/
119 __s32 bmv_unused1; /* future use */
120 __s32 bmv_unused2; /* future use */
121};
122#endif
123
124/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */
125#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
126#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
127#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
128#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC)
129#ifdef __KERNEL__
130#define BMV_IF_EXTENDED 0x40000000	/* getbmapx if set */
131#endif
132
133/*	bmv_oflags values - returned for each non-header segment */
134#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
135
136/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */
137#define GETBMAP_CONVERT(p1,p2) { \
138 p2.bmv_offset = p1.bmv_offset; \
139 p2.bmv_block = p1.bmv_block; \
140 p2.bmv_length = p1.bmv_length; \
141 p2.bmv_count = p1.bmv_count; \
142 p2.bmv_entries = p1.bmv_entries; }
143
144
145/*
146 * Structure for XFS_IOC_FSSETDM.
147 * For use by backup and restore programs to set the XFS on-disk inode
148 * fields di_dmevmask and di_dmstate. These must be set only to values
149 * previously obtained via xfs_bulkstat! (Specifically the
150 * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
151 */
152#ifndef HAVE_FSDMIDATA
153struct fsdmidata {
154 __u32 fsd_dmevmask; /* corresponds to di_dmevmask */
155 __u16 fsd_padding;
156 __u16 fsd_dmstate; /* corresponds to di_dmstate */
157};
158#endif
159
160/*
161 * File segment locking set data type for 64 bit access.
162 * Also used for all the RESV/FREE interfaces.
163 */
164typedef struct xfs_flock64 {
165 __s16 l_type;
166 __s16 l_whence;
167 __s64 l_start;
168 __s64 l_len; /* len == 0 means until end of file */
169 __s32 l_sysid;
170 __u32 l_pid;
171 __s32 l_pad[4]; /* reserve area */
172} xfs_flock64_t;
173
174/*
175 * Output for XFS_IOC_FSGEOMETRY_V1
176 */
177typedef struct xfs_fsop_geom_v1 {
178 __u32 blocksize; /* filesystem (data) block size */
179 __u32 rtextsize; /* realtime extent size */
180 __u32 agblocks; /* fsblocks in an AG */
181 __u32 agcount; /* number of allocation groups */
182 __u32 logblocks; /* fsblocks in the log */
183 __u32 sectsize; /* (data) sector size, bytes */
184 __u32 inodesize; /* inode size in bytes */
185 __u32 imaxpct; /* max allowed inode space(%) */
186 __u64 datablocks; /* fsblocks in data subvolume */
187 __u64 rtblocks; /* fsblocks in realtime subvol */
188 __u64 rtextents; /* rt extents in realtime subvol*/
189 __u64 logstart; /* starting fsblock of the log */
190 unsigned char uuid[16]; /* unique id of the filesystem */
191 __u32 sunit; /* stripe unit, fsblocks */
192 __u32 swidth; /* stripe width, fsblocks */
193 __s32 version; /* structure version */
194 __u32 flags; /* superblock version flags */
195 __u32 logsectsize; /* log sector size, bytes */
196 __u32 rtsectsize; /* realtime sector size, bytes */
197 __u32 dirblocksize; /* directory block size, bytes */
198} xfs_fsop_geom_v1_t;
199
200/*
201 * Output for XFS_IOC_FSGEOMETRY
202 */
203typedef struct xfs_fsop_geom {
204 __u32 blocksize; /* filesystem (data) block size */
205 __u32 rtextsize; /* realtime extent size */
206 __u32 agblocks; /* fsblocks in an AG */
207 __u32 agcount; /* number of allocation groups */
208 __u32 logblocks; /* fsblocks in the log */
209 __u32 sectsize; /* (data) sector size, bytes */
210 __u32 inodesize; /* inode size in bytes */
211 __u32 imaxpct; /* max allowed inode space(%) */
212 __u64 datablocks; /* fsblocks in data subvolume */
213 __u64 rtblocks; /* fsblocks in realtime subvol */
214 __u64 rtextents; /* rt extents in realtime subvol*/
215 __u64 logstart; /* starting fsblock of the log */
216 unsigned char uuid[16]; /* unique id of the filesystem */
217 __u32 sunit; /* stripe unit, fsblocks */
218 __u32 swidth; /* stripe width, fsblocks */
219 __s32 version; /* structure version */
220 __u32 flags; /* superblock version flags */
221 __u32 logsectsize; /* log sector size, bytes */
222 __u32 rtsectsize; /* realtime sector size, bytes */
223 __u32 dirblocksize; /* directory block size, bytes */
224 __u32 logsunit; /* log stripe unit, bytes */
225} xfs_fsop_geom_t;
226
227/* Output for XFS_FS_COUNTS */
228typedef struct xfs_fsop_counts {
229 __u64 freedata; /* free data section blocks */
230 __u64 freertx; /* free rt extents */
231 __u64 freeino; /* free inodes */
232 __u64 allocino; /* total allocated inodes */
233} xfs_fsop_counts_t;
234
235/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
236typedef struct xfs_fsop_resblks {
237 __u64 resblks;
238 __u64 resblks_avail;
239} xfs_fsop_resblks_t;
240
241#define XFS_FSOP_GEOM_VERSION 0
242
243#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */
244#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */
245#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */
246#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */
247#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */
248#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */
249#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */
250#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */
251#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
252#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
253
254
255/*
256 * Minimum and maximum sizes needed for growth checks
257 */
258#define XFS_MIN_AG_BLOCKS 64
259#define XFS_MIN_LOG_BLOCKS 512
260#define XFS_MAX_LOG_BLOCKS (64 * 1024)
261#define XFS_MIN_LOG_BYTES (256 * 1024)
262#define XFS_MAX_LOG_BYTES (128 * 1024 * 1024)
263
264/*
265 * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
266 */
267typedef struct xfs_growfs_data {
268 __u64 newblocks; /* new data subvol size, fsblocks */
269 __u32 imaxpct; /* new inode space percentage limit */
270} xfs_growfs_data_t;
271
272typedef struct xfs_growfs_log {
273 __u32 newblocks; /* new log size, fsblocks */
274 __u32 isint; /* 1 if new log is internal */
275} xfs_growfs_log_t;
276
277typedef struct xfs_growfs_rt {
278 __u64 newblocks; /* new realtime size, fsblocks */
279 __u32 extsize; /* new realtime extent size, fsblocks */
280} xfs_growfs_rt_t;
281
282
283/*
284 * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
285 */
286typedef struct xfs_bstime {
287 time_t tv_sec; /* seconds */
288 __s32 tv_nsec; /* and nanoseconds */
289} xfs_bstime_t;
290
291typedef struct xfs_bstat {
292 __u64 bs_ino; /* inode number */
293 __u16 bs_mode; /* type and mode */
294 __u16 bs_nlink; /* number of links */
295 __u32 bs_uid; /* user id */
296 __u32 bs_gid; /* group id */
297 __u32 bs_rdev; /* device value */
298 __s32 bs_blksize; /* block size */
299 __s64 bs_size; /* file size */
300 xfs_bstime_t bs_atime; /* access time */
301 xfs_bstime_t bs_mtime; /* modify time */
302 xfs_bstime_t bs_ctime; /* inode change time */
303 int64_t bs_blocks; /* number of blocks */
304 __u32 bs_xflags; /* extended flags */
305 __s32 bs_extsize; /* extent size */
306 __s32 bs_extents; /* number of extents */
307 __u32 bs_gen; /* generation count */
308 __u16 bs_projid; /* project id */
309 unsigned char bs_pad[14]; /* pad space, unused */
310 __u32 bs_dmevmask; /* DMIG event mask */
311 __u16 bs_dmstate; /* DMIG state info */
312 __u16 bs_aextents; /* attribute number of extents */
313} xfs_bstat_t;
314
315/*
316 * The user-level BulkStat Request interface structure.
317 */
318typedef struct xfs_fsop_bulkreq {
319 __u64 __user *lastip; /* last inode # pointer */
320 __s32 icount; /* count of entries in buffer */
321 void __user *ubuffer;/* user buffer for inode desc. */
322 __s32 __user *ocount; /* output count pointer */
323} xfs_fsop_bulkreq_t;
324
325
326/*
327 * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
328 */
329typedef struct xfs_inogrp {
330 __u64 xi_startino; /* starting inode number */
331 __s32 xi_alloccount; /* # bits set in allocmask */
332 __u64 xi_allocmask; /* mask of allocated inodes */
333} xfs_inogrp_t;
334
335
336/*
337 * Error injection.
338 */
339typedef struct xfs_error_injection {
340 __s32 fd;
341 __s32 errtag;
342} xfs_error_injection_t;
343
344
345/*
346 * The user-level Handle Request interface structure.
347 */
348typedef struct xfs_fsop_handlereq {
349 __u32 fd; /* fd for FD_TO_HANDLE */
350 void __user *path; /* user pathname */
351 __u32 oflags; /* open flags */
352 void __user *ihandle;/* user supplied handle */
353 __u32 ihandlen; /* user supplied length */
354 void __user *ohandle;/* user buffer for handle */
355 __u32 __user *ohandlen;/* user buffer length */
356} xfs_fsop_handlereq_t;
357
358/*
359 * Compound structures for passing args through Handle Request interfaces
360 * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
361 * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
362 * XFS_IOC_ATTRMULTI_BY_HANDLE
363 */
364
365typedef struct xfs_fsop_setdm_handlereq {
366 struct xfs_fsop_handlereq hreq; /* handle information */
367 struct fsdmidata __user *data; /* DMAPI data */
368} xfs_fsop_setdm_handlereq_t;
369
370typedef struct xfs_attrlist_cursor {
371 __u32 opaque[4];
372} xfs_attrlist_cursor_t;
373
374typedef struct xfs_fsop_attrlist_handlereq {
375 struct xfs_fsop_handlereq hreq; /* handle interface structure */
376 struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
377 __u32 flags; /* which namespace to use */
378 __u32 buflen; /* length of buffer supplied */
379 void __user *buffer; /* returned names */
380} xfs_fsop_attrlist_handlereq_t;
381
382typedef struct xfs_attr_multiop {
383 __u32 am_opcode;
384 __s32 am_error;
385 void __user *am_attrname;
386 void __user *am_attrvalue;
387 __u32 am_length;
388 __u32 am_flags;
389} xfs_attr_multiop_t;
390
391typedef struct xfs_fsop_attrmulti_handlereq {
392 struct xfs_fsop_handlereq hreq; /* handle interface structure */
393 __u32 opcount;/* count of following multiop */
394 struct xfs_attr_multiop __user *ops; /* attr_multi data */
395} xfs_fsop_attrmulti_handlereq_t;
396
397/*
398 * per machine unique filesystem identifier types.
399 */
400typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
401
402
403#ifndef HAVE_FID
404#define MAXFIDSZ 46
405
406typedef struct fid {
407 __u16 fid_len; /* length of data in bytes */
408 unsigned char fid_data[MAXFIDSZ]; /* data (fid_len worth) */
409} fid_t;
410#endif
411
412typedef struct xfs_fid {
413 __u16 xfs_fid_len; /* length of remainder */
414 __u16 xfs_fid_pad;
415 __u32 xfs_fid_gen; /* generation number */
416 __u64 xfs_fid_ino; /* 64 bits inode number */
417} xfs_fid_t;
418
419typedef struct xfs_fid2 {
420 __u16 fid_len; /* length of remainder */
421 __u16 fid_pad; /* padding, must be zero */
422 __u32 fid_gen; /* generation number */
423 __u64 fid_ino; /* inode number */
424} xfs_fid2_t;
425
426typedef struct xfs_handle {
427 union {
428 __s64 align; /* force alignment of ha_fid */
429 xfs_fsid_t _ha_fsid; /* unique file system identifier */
430 } ha_u;
431 xfs_fid_t ha_fid; /* file system specific file ID */
432} xfs_handle_t;
433#define ha_fsid ha_u._ha_fsid
434
435#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.xfs_fid_pad \
436 - (char *) &(handle)) \
437 + (handle).ha_fid.xfs_fid_len)
438
439#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t))
440
441#define FSHSIZE sizeof(fsid_t)
442
443/*
444 * Flags for going down operation
445 */
446#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
447#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
448#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
449
450/*
451 * ioctl commands that are used by Linux filesystems
452 */
453#define XFS_IOC_GETXFLAGS _IOR('f', 1, long)
454#define XFS_IOC_SETXFLAGS _IOW('f', 2, long)
455#define XFS_IOC_GETVERSION _IOR('v', 1, long)
456
457/*
458 * ioctl commands that replace IRIX fcntl()'s
459 * For 'documentation' purposes more than anything else,
460 * the "cmd #" field reflects the IRIX fcntl number.
461 */
462#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
463#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
464#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
465#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr)
466#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr)
467#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
468#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
469#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
470#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata)
471#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64)
472#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64)
473#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64)
474#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64)
475#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap)
476#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr)
477/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
478/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
479#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
480
481/*
482 * ioctl commands that replace IRIX syssgi()'s
483 */
484#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1)
485#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq)
486#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq)
487#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq)
488#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq)
489#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq)
490#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq)
491#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq)
492#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq)
493#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext)
494#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data)
495#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log)
496#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt)
497#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts)
498#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks)
499#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks)
500#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
501#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
502/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
503#define XFS_IOC_FREEZE _IOWR('X', 119, int)
504#define XFS_IOC_THAW _IOWR('X', 120, int)
505#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
506#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
507#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
508#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom)
509#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t)
510/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
511
512
513#ifndef HAVE_BBMACROS
514/*
515 * Block I/O parameterization. A basic block (BB) is the lowest size of
516 * filesystem allocation, and must equal 512. Length units given to bio
517 * routines are in BB's.
518 */
519#define BBSHIFT 9
520#define BBSIZE (1<<BBSHIFT)
521#define BBMASK (BBSIZE-1)
522#define BTOBB(bytes) (((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
523#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT)
524#define BBTOB(bbs) ((bbs) << BBSHIFT)
525#endif
526
527#endif /* __XFS_FS_H__ */
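A quick sanity sketch of the BB macros above, adapted for userspace
(illustrative only, not part of the kernel sources; the __u64 cast is
replaced with unsigned long long): BTOBB() rounds byte counts up to
whole basic blocks, BTOBBT() truncates, and BBTOB() converts basic
blocks back to bytes.

	#include <assert.h>

	#define BBSHIFT	9
	#define BBSIZE	(1<<BBSHIFT)
	#define BTOBB(bytes)	(((unsigned long long)(bytes) + BBSIZE - 1) >> BBSHIFT)
	#define BTOBBT(bytes)	((unsigned long long)(bytes) >> BBSHIFT)
	#define BBTOB(bbs)	((bbs) << BBSHIFT)

	int main(void)
	{
		assert(BTOBB(1) == 1);		/* partial block rounds up */
		assert(BTOBB(512) == 1);
		assert(BTOBB(513) == 2);
		assert(BTOBBT(513) == 1);	/* truncating variant rounds down */
		assert(BBTOB(2) == 1024);
		return 0;
	}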
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
new file mode 100644
index 000000000000..21213057c27f
--- /dev/null
+++ b/fs/xfs/xfs_fsops.c
@@ -0,0 +1,616 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h"
43#include "xfs_ag.h"
44#include "xfs_alloc_btree.h"
45#include "xfs_bmap_btree.h"
46#include "xfs_ialloc_btree.h"
47#include "xfs_btree.h"
48#include "xfs_error.h"
49#include "xfs_alloc.h"
50#include "xfs_ialloc.h"
51#include "xfs_fsops.h"
52#include "xfs_itable.h"
53#include "xfs_rw.h"
54#include "xfs_refcache.h"
55#include "xfs_trans_space.h"
56#include "xfs_rtalloc.h"
57#include "xfs_dir2.h"
58#include "xfs_attr_sf.h"
59#include "xfs_dir_sf.h"
60#include "xfs_dir2_sf.h"
61#include "xfs_dinode.h"
62#include "xfs_inode.h"
63#include "xfs_inode_item.h"
64
65/*
66 * File system operations
67 */
68
69int
70xfs_fs_geometry(
71 xfs_mount_t *mp,
72 xfs_fsop_geom_t *geo,
73 int new_version)
74{
75 geo->blocksize = mp->m_sb.sb_blocksize;
76 geo->rtextsize = mp->m_sb.sb_rextsize;
77 geo->agblocks = mp->m_sb.sb_agblocks;
78 geo->agcount = mp->m_sb.sb_agcount;
79 geo->logblocks = mp->m_sb.sb_logblocks;
80 geo->sectsize = mp->m_sb.sb_sectsize;
81 geo->inodesize = mp->m_sb.sb_inodesize;
82 geo->imaxpct = mp->m_sb.sb_imax_pct;
83 geo->datablocks = mp->m_sb.sb_dblocks;
84 geo->rtblocks = mp->m_sb.sb_rblocks;
85 geo->rtextents = mp->m_sb.sb_rextents;
86 geo->logstart = mp->m_sb.sb_logstart;
87 ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
88 memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
89 if (new_version >= 2) {
90 geo->sunit = mp->m_sb.sb_unit;
91 geo->swidth = mp->m_sb.sb_width;
92 }
93 if (new_version >= 3) {
94 geo->version = XFS_FSOP_GEOM_VERSION;
95 geo->flags =
96 (XFS_SB_VERSION_HASATTR(&mp->m_sb) ?
97 XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
98 (XFS_SB_VERSION_HASNLINK(&mp->m_sb) ?
99 XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
100 (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ?
101 XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
102 (XFS_SB_VERSION_HASALIGN(&mp->m_sb) ?
103 XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
104 (XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ?
105 XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
106 (XFS_SB_VERSION_HASSHARED(&mp->m_sb) ?
107 XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
108 (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ?
109 XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
110 (XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
111 XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
112 (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
113 XFS_FSOP_GEOM_FLAGS_SECTOR : 0);
114 geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
115 mp->m_sb.sb_logsectsize : BBSIZE;
116 geo->rtsectsize = mp->m_sb.sb_blocksize;
117 geo->dirblocksize = mp->m_dirblksize;
118 }
119 if (new_version >= 4) {
120 geo->flags |=
121 (XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ?
122 XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
123 geo->logsunit = mp->m_sb.sb_logsunit;
124 }
125 return 0;
126}
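/*
 * Note (editorial): new_version gates how much of *geo is filled in,
 * so a caller built against an older xfs_fsop_geom layout only has
 * the fields it knows about written; anything gated on a higher
 * version is left untouched.
 */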
127
128static int
129xfs_growfs_data_private(
130 xfs_mount_t *mp, /* mount point for filesystem */
131 xfs_growfs_data_t *in) /* growfs data input struct */
132{
133 xfs_agf_t *agf;
134 xfs_agi_t *agi;
135 xfs_agnumber_t agno;
136 xfs_extlen_t agsize;
137 xfs_extlen_t tmpsize;
138 xfs_alloc_rec_t *arec;
139 xfs_btree_sblock_t *block;
140 xfs_buf_t *bp;
141 int bucket;
142 int dpct;
143 int error;
144 xfs_agnumber_t nagcount;
145 xfs_agnumber_t nagimax = 0;
146 xfs_rfsblock_t nb, nb_mod;
147 xfs_rfsblock_t new;
148 xfs_rfsblock_t nfree;
149 xfs_agnumber_t oagcount;
150 int pct;
151 xfs_sb_t *sbp;
152 xfs_trans_t *tp;
153
154 nb = in->newblocks;
155 pct = in->imaxpct;
156 if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
157 return XFS_ERROR(EINVAL);
158 dpct = pct - mp->m_sb.sb_imax_pct;
159 error = xfs_read_buf(mp, mp->m_ddev_targp,
160 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
161 XFS_FSS_TO_BB(mp, 1), 0, &bp);
162 if (error)
163 return error;
164 ASSERT(bp);
165 xfs_buf_relse(bp);
166
167 new = nb; /* use new as a temporary here */
168 nb_mod = do_div(new, mp->m_sb.sb_agblocks);
169 nagcount = new + (nb_mod != 0);
170 if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
171 nagcount--;
172 nb = nagcount * mp->m_sb.sb_agblocks;
173 if (nb < mp->m_sb.sb_dblocks)
174 return XFS_ERROR(EINVAL);
175 }
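	/*
	 * Worked example (illustrative numbers): with sb_agblocks == 1000
	 * and nb == 2030, do_div() leaves new == 2 and nb_mod == 30, so
	 * nagcount == 3; a 30-block runt AG is below XFS_MIN_AG_BLOCKS,
	 * so it is trimmed back to nagcount == 2, nb == 2000.
	 */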
176 new = nb - mp->m_sb.sb_dblocks;
177 oagcount = mp->m_sb.sb_agcount;
178 if (nagcount > oagcount) {
179 down_write(&mp->m_peraglock);
180 mp->m_perag = kmem_realloc(mp->m_perag,
181 sizeof(xfs_perag_t) * nagcount,
182 sizeof(xfs_perag_t) * oagcount,
183 KM_SLEEP);
184 memset(&mp->m_perag[oagcount], 0,
185 (nagcount - oagcount) * sizeof(xfs_perag_t));
186 mp->m_flags |= XFS_MOUNT_32BITINODES;
187 nagimax = xfs_initialize_perag(mp, nagcount);
188 up_write(&mp->m_peraglock);
189 }
190 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
191 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
192 XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
193 xfs_trans_cancel(tp, 0);
194 return error;
195 }
196
197 nfree = 0;
198 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
199 /*
200 * AG freelist header block
201 */
202 bp = xfs_buf_get(mp->m_ddev_targp,
203 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
204 XFS_FSS_TO_BB(mp, 1), 0);
205 agf = XFS_BUF_TO_AGF(bp);
206 memset(agf, 0, mp->m_sb.sb_sectsize);
207 INT_SET(agf->agf_magicnum, ARCH_CONVERT, XFS_AGF_MAGIC);
208 INT_SET(agf->agf_versionnum, ARCH_CONVERT, XFS_AGF_VERSION);
209 INT_SET(agf->agf_seqno, ARCH_CONVERT, agno);
210 if (agno == nagcount - 1)
211 agsize =
212 nb -
213 (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
214 else
215 agsize = mp->m_sb.sb_agblocks;
216 INT_SET(agf->agf_length, ARCH_CONVERT, agsize);
217 INT_SET(agf->agf_roots[XFS_BTNUM_BNOi], ARCH_CONVERT,
218 XFS_BNO_BLOCK(mp));
219 INT_SET(agf->agf_roots[XFS_BTNUM_CNTi], ARCH_CONVERT,
220 XFS_CNT_BLOCK(mp));
221 INT_SET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT, 1);
222 INT_SET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT, 1);
223 agf->agf_flfirst = 0;
224 INT_SET(agf->agf_fllast, ARCH_CONVERT, XFS_AGFL_SIZE(mp) - 1);
225 agf->agf_flcount = 0;
226 tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
227 INT_SET(agf->agf_freeblks, ARCH_CONVERT, tmpsize);
228 INT_SET(agf->agf_longest, ARCH_CONVERT, tmpsize);
229 error = xfs_bwrite(mp, bp);
230 if (error) {
231 goto error0;
232 }
233 /*
234 * AG inode header block
235 */
236 bp = xfs_buf_get(mp->m_ddev_targp,
237 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
238 XFS_FSS_TO_BB(mp, 1), 0);
239 agi = XFS_BUF_TO_AGI(bp);
240 memset(agi, 0, mp->m_sb.sb_sectsize);
241 INT_SET(agi->agi_magicnum, ARCH_CONVERT, XFS_AGI_MAGIC);
242 INT_SET(agi->agi_versionnum, ARCH_CONVERT, XFS_AGI_VERSION);
243 INT_SET(agi->agi_seqno, ARCH_CONVERT, agno);
244 INT_SET(agi->agi_length, ARCH_CONVERT, agsize);
245 agi->agi_count = 0;
246 INT_SET(agi->agi_root, ARCH_CONVERT, XFS_IBT_BLOCK(mp));
247 INT_SET(agi->agi_level, ARCH_CONVERT, 1);
248 agi->agi_freecount = 0;
249 INT_SET(agi->agi_newino, ARCH_CONVERT, NULLAGINO);
250 INT_SET(agi->agi_dirino, ARCH_CONVERT, NULLAGINO);
251 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
252 INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT,
253 NULLAGINO);
254 error = xfs_bwrite(mp, bp);
255 if (error) {
256 goto error0;
257 }
258 /*
259 * BNO btree root block
260 */
261 bp = xfs_buf_get(mp->m_ddev_targp,
262 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
263 BTOBB(mp->m_sb.sb_blocksize), 0);
264 block = XFS_BUF_TO_SBLOCK(bp);
265 memset(block, 0, mp->m_sb.sb_blocksize);
266 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTB_MAGIC);
267 block->bb_level = 0;
268 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
269 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
270 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
271 arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc,
272 block, 1, mp->m_alloc_mxr[0]);
273 INT_SET(arec->ar_startblock, ARCH_CONVERT,
274 XFS_PREALLOC_BLOCKS(mp));
275 INT_SET(arec->ar_blockcount, ARCH_CONVERT,
276 agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
277 error = xfs_bwrite(mp, bp);
278 if (error) {
279 goto error0;
280 }
281 /*
282 * CNT btree root block
283 */
284 bp = xfs_buf_get(mp->m_ddev_targp,
285 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
286 BTOBB(mp->m_sb.sb_blocksize), 0);
287 block = XFS_BUF_TO_SBLOCK(bp);
288 memset(block, 0, mp->m_sb.sb_blocksize);
289 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTC_MAGIC);
290 block->bb_level = 0;
291 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
292 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
293 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
294 arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc,
295 block, 1, mp->m_alloc_mxr[0]);
296 INT_SET(arec->ar_startblock, ARCH_CONVERT,
297 XFS_PREALLOC_BLOCKS(mp));
298 INT_SET(arec->ar_blockcount, ARCH_CONVERT,
299 agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
300 nfree += INT_GET(arec->ar_blockcount, ARCH_CONVERT);
301 error = xfs_bwrite(mp, bp);
302 if (error) {
303 goto error0;
304 }
305 /*
306 * INO btree root block
307 */
308 bp = xfs_buf_get(mp->m_ddev_targp,
309 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
310 BTOBB(mp->m_sb.sb_blocksize), 0);
311 block = XFS_BUF_TO_SBLOCK(bp);
312 memset(block, 0, mp->m_sb.sb_blocksize);
313 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_IBT_MAGIC);
314 block->bb_level = 0;
315 block->bb_numrecs = 0;
316 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
317 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
318 error = xfs_bwrite(mp, bp);
319 if (error) {
320 goto error0;
321 }
322 }
323 xfs_trans_agblocks_delta(tp, nfree);
324 /*
325 * There are new blocks in the old last a.g.
326 */
327 if (new) {
328 /*
329 * Change the agi length.
330 */
331 error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
332 if (error) {
333 goto error0;
334 }
335 ASSERT(bp);
336 agi = XFS_BUF_TO_AGI(bp);
337 INT_MOD(agi->agi_length, ARCH_CONVERT, new);
338 ASSERT(nagcount == oagcount ||
339 INT_GET(agi->agi_length, ARCH_CONVERT) ==
340 mp->m_sb.sb_agblocks);
341 xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
342 /*
343 * Change agf length.
344 */
345 error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
346 if (error) {
347 goto error0;
348 }
349 ASSERT(bp);
350 agf = XFS_BUF_TO_AGF(bp);
351 INT_MOD(agf->agf_length, ARCH_CONVERT, new);
352 ASSERT(INT_GET(agf->agf_length, ARCH_CONVERT) ==
353 INT_GET(agi->agi_length, ARCH_CONVERT));
354 /*
355 * Free the new space.
356 */
357 error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
358 INT_GET(agf->agf_length, ARCH_CONVERT) - new), new);
359 if (error) {
360 goto error0;
361 }
362 }
363 if (nagcount > oagcount)
364 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
365 if (nb > mp->m_sb.sb_dblocks)
366 xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
367 nb - mp->m_sb.sb_dblocks);
368 if (nfree)
369 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
370 if (dpct)
371 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
372 error = xfs_trans_commit(tp, 0, NULL);
373 if (error) {
374 return error;
375 }
376 /* New allocation groups fully initialized, so update mount struct */
377 if (nagimax)
378 mp->m_maxagi = nagimax;
379 if (mp->m_sb.sb_imax_pct) {
380 __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
381 do_div(icount, 100);
382 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
383 } else
384 mp->m_maxicount = 0;
385 for (agno = 1; agno < nagcount; agno++) {
386 error = xfs_read_buf(mp, mp->m_ddev_targp,
387 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
388 XFS_FSS_TO_BB(mp, 1), 0, &bp);
389 if (error) {
390 xfs_fs_cmn_err(CE_WARN, mp,
391 "error %d reading secondary superblock for ag %d",
392 error, agno);
393 break;
394 }
395 sbp = XFS_BUF_TO_SBP(bp);
396 xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);
397 /*
398 * If we get an error writing out the alternate superblocks,
399 * just issue a warning and continue. The real work is
400 * already done and committed.
401 */
402 if (!(error = xfs_bwrite(mp, bp))) {
403 continue;
404 } else {
405 xfs_fs_cmn_err(CE_WARN, mp,
406 "write error %d updating secondary superblock for ag %d",
407 error, agno);
408 break; /* no point in continuing */
409 }
410 }
411 return 0;
412
413 error0:
414 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
415 return error;
416}
417
418static int
419xfs_growfs_log_private(
420 xfs_mount_t *mp, /* mount point for filesystem */
421 xfs_growfs_log_t *in) /* growfs log input struct */
422{
423 xfs_extlen_t nb;
424
425 nb = in->newblocks;
426 if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
427 return XFS_ERROR(EINVAL);
428 if (nb == mp->m_sb.sb_logblocks &&
429 in->isint == (mp->m_sb.sb_logstart != 0))
430 return XFS_ERROR(EINVAL);
431 /*
432	 * Moving the log is hard: it needs new interfaces to sync the
433	 * log first and to hold off all activity while moving it.
434	 * It could yield a shorter or longer log in the same space,
435	 * or transform an internal log into an external one, or vice versa.
436 */
437 return XFS_ERROR(ENOSYS);
438}
439
440/*
441 * Protected versions of the growfs functions: they acquire and release locks
442 * on the mount point and are exported through the ioctls XFS_IOC_FSGROWFSDATA,
443 * XFS_IOC_FSGROWFSLOG and XFS_IOC_FSGROWFSRT.
444 */
445
446
447int
448xfs_growfs_data(
449 xfs_mount_t *mp,
450 xfs_growfs_data_t *in)
451{
452 int error;
453 if (!cpsema(&mp->m_growlock))
454 return XFS_ERROR(EWOULDBLOCK);
455 error = xfs_growfs_data_private(mp, in);
456 vsema(&mp->m_growlock);
457 return error;
458}
459
460int
461xfs_growfs_log(
462 xfs_mount_t *mp,
463 xfs_growfs_log_t *in)
464{
465 int error;
466 if (!cpsema(&mp->m_growlock))
467 return XFS_ERROR(EWOULDBLOCK);
468 error = xfs_growfs_log_private(mp, in);
469 vsema(&mp->m_growlock);
470 return error;
471}
472
473/*
474 * exported through ioctl XFS_IOC_FSCOUNTS
475 */
476
477int
478xfs_fs_counts(
479 xfs_mount_t *mp,
480 xfs_fsop_counts_t *cnt)
481{
482 unsigned long s;
483
484 s = XFS_SB_LOCK(mp);
485 cnt->freedata = mp->m_sb.sb_fdblocks;
486 cnt->freertx = mp->m_sb.sb_frextents;
487 cnt->freeino = mp->m_sb.sb_ifree;
488 cnt->allocino = mp->m_sb.sb_icount;
489 XFS_SB_UNLOCK(mp, s);
490 return 0;
491}
492
493/*
494 * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
495 *
496 * xfs_reserve_blocks is called to set m_resblks
497 * in the in-core mount table. The number of unused reserved blocks
498 * is kept in m_resblks_avail.
499 *
500 * Reserve the requested number of blocks if available. Otherwise return
501 * as many as possible to satisfy the request. The actual number
502 * reserved is returned in outval.
503 *
504 * A null inval pointer indicates that only the current number of reserved
505 * blocks available should be returned; no settings are changed.
506 */
507
508int
509xfs_reserve_blocks(
510 xfs_mount_t *mp,
511 __uint64_t *inval,
512 xfs_fsop_resblks_t *outval)
513{
514 __int64_t lcounter, delta;
515 __uint64_t request;
516 unsigned long s;
517
518 /* If inval is null, report current values and return */
519
520 if (inval == (__uint64_t *)NULL) {
521 outval->resblks = mp->m_resblks;
522 outval->resblks_avail = mp->m_resblks_avail;
523 return(0);
524 }
525
526 request = *inval;
527 s = XFS_SB_LOCK(mp);
528
529 /*
530 * If our previous reservation was larger than the current value,
531 * then move any unused blocks back to the free pool.
532 */
533
534 if (mp->m_resblks > request) {
535 lcounter = mp->m_resblks_avail - request;
536 if (lcounter > 0) { /* release unused blocks */
537 mp->m_sb.sb_fdblocks += lcounter;
538 mp->m_resblks_avail -= lcounter;
539 }
540 mp->m_resblks = request;
541 } else {
542 delta = request - mp->m_resblks;
543 lcounter = mp->m_sb.sb_fdblocks - delta;
544 if (lcounter < 0) {
545 /* We can't satisfy the request, just get what we can */
546 mp->m_resblks += mp->m_sb.sb_fdblocks;
547 mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
548 mp->m_sb.sb_fdblocks = 0;
549 } else {
550 mp->m_sb.sb_fdblocks = lcounter;
551 mp->m_resblks = request;
552 mp->m_resblks_avail += delta;
553 }
554 }
555
556 outval->resblks = mp->m_resblks;
557 outval->resblks_avail = mp->m_resblks_avail;
558 XFS_SB_UNLOCK(mp, s);
559 return(0);
560}
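/*
 * Illustrative caller (a sketch): passing a NULL inval reads the
 * current reservation (the XFS_IOC_GET_RESBLKS direction), while a
 * non-NULL inval requests a new one (XFS_IOC_SET_RESBLKS):
 *
 *	__uint64_t in = 1024;
 *	xfs_fsop_resblks_t out;
 *	error = xfs_reserve_blocks(mp, &in, &out);
 *
 * On return, out.resblks is the new reservation ceiling and
 * out.resblks_avail may fall short of it if free space ran out.
 */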
561
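/*
 * Editorial summary of the function below: it logs the root inode core
 * in a dummy transaction and commits it synchronously, which gives an
 * otherwise-idle filesystem something in the on-disk log.
 */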
562void
563xfs_fs_log_dummy(xfs_mount_t *mp)
564{
565 xfs_trans_t *tp;
566 xfs_inode_t *ip;
567
568
569 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
570 atomic_inc(&mp->m_active_trans);
571 if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
572 xfs_trans_cancel(tp, 0);
573 return;
574 }
575
576 ip = mp->m_rootip;
577 xfs_ilock(ip, XFS_ILOCK_EXCL);
578
579 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
580 xfs_trans_ihold(tp, ip);
581 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
582 xfs_trans_set_sync(tp);
583 xfs_trans_commit(tp, 0, NULL);
584
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586}
587
588int
589xfs_fs_goingdown(
590 xfs_mount_t *mp,
591 __uint32_t inflags)
592{
593 switch (inflags) {
594 case XFS_FSOP_GOING_FLAGS_DEFAULT: {
595 struct vfs *vfsp = XFS_MTOVFS(mp);
596 struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
597
598 if (sb) {
599 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
600 thaw_bdev(sb->s_bdev, sb);
601 }
602
603 break;
604 }
605 case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
606 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
607 break;
608 case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
609 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR);
610 break;
611 default:
612 return XFS_ERROR(EINVAL);
613 }
614
615 return 0;
616}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
new file mode 100644
index 000000000000..b61486173a61
--- /dev/null
+++ b/fs/xfs/xfs_fsops.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_FSOPS_H__
33#define __XFS_FSOPS_H__
34
35int
36xfs_fs_geometry(
37 xfs_mount_t *mp,
38 xfs_fsop_geom_t *geo,
39 int new_version);
40
41int
42xfs_growfs_data(
43 xfs_mount_t *mp,
44 xfs_growfs_data_t *in);
45
46int
47xfs_growfs_log(
48 xfs_mount_t *mp,
49 xfs_growfs_log_t *in);
50
51int
52xfs_fs_counts(
53 xfs_mount_t *mp,
54 xfs_fsop_counts_t *cnt);
55
56int
57xfs_reserve_blocks(
58 xfs_mount_t *mp,
59 __uint64_t *inval,
60 xfs_fsop_resblks_t *outval);
61
62int
63xfs_fs_goingdown(
64 xfs_mount_t *mp,
65 __uint32_t inflags);
66
67#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
new file mode 100644
index 000000000000..ce5fee9eaec5
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.c
@@ -0,0 +1,1401 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_alloc.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_bmap.h"
61
62/*
63 * Log specified fields for the inode given by bp and off.
64 */
65STATIC void
66xfs_ialloc_log_di(
67 xfs_trans_t *tp, /* transaction pointer */
68 xfs_buf_t *bp, /* inode buffer */
69 int off, /* index of inode in buffer */
70 int fields) /* bitmask of fields to log */
71{
72 int first; /* first byte number */
73 int ioffset; /* off in bytes */
74 int last; /* last byte number */
75 xfs_mount_t *mp; /* mount point structure */
76 static const short offsets[] = { /* field offsets */
77 /* keep in sync with bits */
78 offsetof(xfs_dinode_core_t, di_magic),
79 offsetof(xfs_dinode_core_t, di_mode),
80 offsetof(xfs_dinode_core_t, di_version),
81 offsetof(xfs_dinode_core_t, di_format),
82 offsetof(xfs_dinode_core_t, di_onlink),
83 offsetof(xfs_dinode_core_t, di_uid),
84 offsetof(xfs_dinode_core_t, di_gid),
85 offsetof(xfs_dinode_core_t, di_nlink),
86 offsetof(xfs_dinode_core_t, di_projid),
87 offsetof(xfs_dinode_core_t, di_pad),
88 offsetof(xfs_dinode_core_t, di_atime),
89 offsetof(xfs_dinode_core_t, di_mtime),
90 offsetof(xfs_dinode_core_t, di_ctime),
91 offsetof(xfs_dinode_core_t, di_size),
92 offsetof(xfs_dinode_core_t, di_nblocks),
93 offsetof(xfs_dinode_core_t, di_extsize),
94 offsetof(xfs_dinode_core_t, di_nextents),
95 offsetof(xfs_dinode_core_t, di_anextents),
96 offsetof(xfs_dinode_core_t, di_forkoff),
97 offsetof(xfs_dinode_core_t, di_aformat),
98 offsetof(xfs_dinode_core_t, di_dmevmask),
99 offsetof(xfs_dinode_core_t, di_dmstate),
100 offsetof(xfs_dinode_core_t, di_flags),
101 offsetof(xfs_dinode_core_t, di_gen),
102 offsetof(xfs_dinode_t, di_next_unlinked),
103 offsetof(xfs_dinode_t, di_u),
104 offsetof(xfs_dinode_t, di_a),
105 sizeof(xfs_dinode_t)
106 };
107
108
109 ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
110 ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
111 mp = tp->t_mountp;
112 /*
113 * Get the inode-relative first and last bytes for these fields
114 */
115 xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
116 /*
117 * Convert to buffer offsets and log it.
118 */
119 ioffset = off << mp->m_sb.sb_inodelog;
120 first += ioffset;
121 last += ioffset;
122 xfs_trans_log_buf(tp, bp, first, last);
123}
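/*
 * Worked example (illustrative, assuming the usual xfs_btree_offsets
 * semantics): fields == XFS_DI_MODE | XFS_DI_UID yields
 * first == offsetof(di_mode) and last == offsetof(di_gid) - 1, i.e.
 * one contiguous byte range spanning the lowest through highest
 * requested fields; off << sb_inodelog then rebases that range from
 * inode-relative to buffer-relative offsets before logging.
 */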
124
125/*
126 * Allocation group level functions.
127 */
128
129/*
130 * Allocate new inodes in the allocation group specified by agbp.
131 * Return 0 for success, else error code.
132 */
133STATIC int /* error code or 0 */
134xfs_ialloc_ag_alloc(
135 xfs_trans_t *tp, /* transaction pointer */
136 xfs_buf_t *agbp, /* alloc group buffer */
137 int *alloc)
138{
139 xfs_agi_t *agi; /* allocation group header */
140 xfs_alloc_arg_t args; /* allocation argument structure */
141 int blks_per_cluster; /* fs blocks per inode cluster */
142 xfs_btree_cur_t *cur; /* inode btree cursor */
143 xfs_daddr_t d; /* disk addr of buffer */
144 int error;
145 xfs_buf_t *fbuf; /* new free inodes' buffer */
146 xfs_dinode_t *free; /* new free inode structure */
147 int i; /* inode counter */
148 int j; /* block counter */
149 int nbufs; /* num bufs of new inodes */
150 xfs_agino_t newino; /* new first inode's number */
151 xfs_agino_t newlen; /* new number of inodes */
152 int ninodes; /* num inodes per buf */
153 xfs_agino_t thisino; /* current inode number, for loop */
154 int version; /* inode version number to use */
155 int isaligned; /* inode allocation at stripe unit */
156 /* boundary */
157 xfs_dinode_core_t dic; /* a dinode_core to copy to new */
158 /* inodes */
159
160 args.tp = tp;
161 args.mp = tp->t_mountp;
162
163 /*
164 * Locking will ensure that we don't have two callers in here
165 * at one time.
166 */
167 newlen = XFS_IALLOC_INODES(args.mp);
168 if (args.mp->m_maxicount &&
169 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
170 return XFS_ERROR(ENOSPC);
171 args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
172 /*
173 * Set the alignment for the allocation.
174 * If stripe alignment is turned on then align at stripe unit
175 * boundary.
176 * If the cluster size is smaller than a filesystem block
177 * then we're doing I/O for inodes in filesystem block size pieces,
178	 * so we don't need alignment anyway.
179 */
180 isaligned = 0;
181 if (args.mp->m_sinoalign) {
182 ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
183 args.alignment = args.mp->m_dalign;
184 isaligned = 1;
185 } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
186 args.mp->m_sb.sb_inoalignmt >=
187 XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
188 args.alignment = args.mp->m_sb.sb_inoalignmt;
189 else
190 args.alignment = 1;
191 agi = XFS_BUF_TO_AGI(agbp);
192 /*
193 * Need to figure out where to allocate the inode blocks.
194 * Ideally they should be spaced out through the a.g.
195 * For now, just allocate blocks up front.
196 */
197 args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
198 args.fsbno = XFS_AGB_TO_FSB(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
199 args.agbno);
200 /*
201 * Allocate a fixed-size extent of inodes.
202 */
203 args.type = XFS_ALLOCTYPE_NEAR_BNO;
204 args.mod = args.total = args.wasdel = args.isfl = args.userdata =
205 args.minalignslop = 0;
206 args.prod = 1;
207 /*
208 * Allow space for the inode btree to split.
209 */
210 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
211 if ((error = xfs_alloc_vextent(&args)))
212 return error;
213
214 /*
215 * If stripe alignment is turned on, then try again with cluster
216 * alignment.
217 */
218 if (isaligned && args.fsbno == NULLFSBLOCK) {
219 args.type = XFS_ALLOCTYPE_NEAR_BNO;
220 args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
221 args.fsbno = XFS_AGB_TO_FSB(args.mp,
222 INT_GET(agi->agi_seqno, ARCH_CONVERT), args.agbno);
223 if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
224 args.mp->m_sb.sb_inoalignmt >=
225 XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
226 args.alignment = args.mp->m_sb.sb_inoalignmt;
227 else
228 args.alignment = 1;
229 if ((error = xfs_alloc_vextent(&args)))
230 return error;
231 }
232
233 if (args.fsbno == NULLFSBLOCK) {
234 *alloc = 0;
235 return 0;
236 }
237 ASSERT(args.len == args.minlen);
238 /*
239 * Convert the results.
240 */
241 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
242 /*
243 * Loop over the new block(s), filling in the inodes.
244 * For small block sizes, manipulate the inodes in buffers
245 * which are multiples of the blocks size.
246 */
247 if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
248 blks_per_cluster = 1;
249 nbufs = (int)args.len;
250 ninodes = args.mp->m_sb.sb_inopblock;
251 } else {
252 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
253 args.mp->m_sb.sb_blocksize;
254 nbufs = (int)args.len / blks_per_cluster;
255 ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
256 }
257 /*
258 * Figure out what version number to use in the inodes we create.
259 * If the superblock version has caught up to the one that supports
260 * the new inode format, then use the new inode version. Otherwise
261 * use the old version so that old kernels will continue to be
262 * able to use the file system.
263 */
264 if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
265 version = XFS_DINODE_VERSION_2;
266 else
267 version = XFS_DINODE_VERSION_1;
268
269 memset(&dic, 0, sizeof(xfs_dinode_core_t));
270 INT_SET(dic.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC);
271 INT_SET(dic.di_version, ARCH_CONVERT, version);
272
273 for (j = 0; j < nbufs; j++) {
274 /*
275 * Get the block.
276 */
277 d = XFS_AGB_TO_DADDR(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
278 args.agbno + (j * blks_per_cluster));
279 fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
280 args.mp->m_bsize * blks_per_cluster,
281 XFS_BUF_LOCK);
282 ASSERT(fbuf);
283 ASSERT(!XFS_BUF_GETERROR(fbuf));
284 /*
285 * Loop over the inodes in this buffer.
286 */
287
288 for (i = 0; i < ninodes; i++) {
289 free = XFS_MAKE_IPTR(args.mp, fbuf, i);
290 memcpy(&(free->di_core), &dic, sizeof(xfs_dinode_core_t));
291 INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
292 xfs_ialloc_log_di(tp, fbuf, i,
293 XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
294 }
295 xfs_trans_inode_alloc_buf(tp, fbuf);
296 }
297 INT_MOD(agi->agi_count, ARCH_CONVERT, newlen);
298 INT_MOD(agi->agi_freecount, ARCH_CONVERT, newlen);
299 down_read(&args.mp->m_peraglock);
300 args.mp->m_perag[INT_GET(agi->agi_seqno, ARCH_CONVERT)].pagi_freecount += newlen;
301 up_read(&args.mp->m_peraglock);
302 INT_SET(agi->agi_newino, ARCH_CONVERT, newino);
303 /*
304 * Insert records describing the new inode chunk into the btree.
305 */
306 cur = xfs_btree_init_cursor(args.mp, tp, agbp,
307 INT_GET(agi->agi_seqno, ARCH_CONVERT),
308 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
309 for (thisino = newino;
310 thisino < newino + newlen;
311 thisino += XFS_INODES_PER_CHUNK) {
312 if ((error = xfs_inobt_lookup_eq(cur, thisino,
313 XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
314 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
315 return error;
316 }
317 ASSERT(i == 0);
318 if ((error = xfs_inobt_insert(cur, &i))) {
319 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
320 return error;
321 }
322 ASSERT(i == 1);
323 }
324 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
325 /*
326 * Log allocation group header fields
327 */
328 xfs_ialloc_log_agi(tp, agbp,
329 XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
330 /*
331 * Modify/log superblock values for inode count and inode free count.
332 */
333 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
334 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
335 *alloc = 1;
336 return 0;
337}
338
339STATIC __inline xfs_agnumber_t
340xfs_ialloc_next_ag(
341 xfs_mount_t *mp)
342{
343 xfs_agnumber_t agno;
344
345 spin_lock(&mp->m_agirotor_lock);
346 agno = mp->m_agirotor;
347 if (++mp->m_agirotor == mp->m_maxagi)
348 mp->m_agirotor = 0;
349 spin_unlock(&mp->m_agirotor_lock);
350
351 return agno;
352}
353
354/*
355 * Select an allocation group to look for a free inode in, based on the parent
356 * inode and then the mode. Return the allocation group buffer.
357 */
358STATIC xfs_buf_t * /* allocation group buffer */
359xfs_ialloc_ag_select(
360 xfs_trans_t *tp, /* transaction pointer */
361 xfs_ino_t parent, /* parent directory inode number */
362 mode_t mode, /* bits set to indicate file type */
363 int okalloc) /* ok to allocate more space */
364{
365 xfs_buf_t *agbp; /* allocation group header buffer */
366 xfs_agnumber_t agcount; /* number of ag's in the filesystem */
367 xfs_agnumber_t agno; /* current ag number */
368 int flags; /* alloc buffer locking flags */
369 xfs_extlen_t ineed; /* blocks needed for inode allocation */
370 xfs_extlen_t longest = 0; /* longest extent available */
371 xfs_mount_t *mp; /* mount point structure */
372 int needspace; /* file mode implies space allocated */
373 xfs_perag_t *pag; /* per allocation group data */
374 xfs_agnumber_t pagno; /* parent (starting) ag number */
375
376 /*
377 * Files of these types need at least one block if length > 0
378 * (and they won't fit in the inode, but that's hard to figure out).
379 */
380 needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
381 mp = tp->t_mountp;
382 agcount = mp->m_maxagi;
383 if (S_ISDIR(mode))
384 pagno = xfs_ialloc_next_ag(mp);
385 else {
386 pagno = XFS_INO_TO_AGNO(mp, parent);
387 if (pagno >= agcount)
388 pagno = 0;
389 }
390 ASSERT(pagno < agcount);
391 /*
392 * Loop through allocation groups, looking for one with a little
393 * free space in it. Note we don't look for free inodes, exactly.
394 * Instead, we account for whether inodes will need to be allocated,
395 * which requires that blocks be available to allocate them from
396 * if none are currently free.
397 */
398 agno = pagno;
399 flags = XFS_ALLOC_FLAG_TRYLOCK;
400 down_read(&mp->m_peraglock);
401 for (;;) {
402 pag = &mp->m_perag[agno];
403 if (!pag->pagi_init) {
404 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
405 agbp = NULL;
406 goto nextag;
407 }
408 } else
409 agbp = NULL;
410
411 if (!pag->pagi_inodeok) {
412 xfs_ialloc_next_ag(mp);
413 goto unlock_nextag;
414 }
415
416 /*
417 * Is there enough free space for the file plus a block
418 * of inodes (if we need to allocate some)?
419 */
420 ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
421 if (ineed && !pag->pagf_init) {
422 if (agbp == NULL &&
423 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
424 agbp = NULL;
425 goto nextag;
426 }
427 (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
428 }
429 if (!ineed || pag->pagf_init) {
430 if (ineed && !(longest = pag->pagf_longest))
431 longest = pag->pagf_flcount > 0;
432 if (!ineed ||
433 (pag->pagf_freeblks >= needspace + ineed &&
434 longest >= ineed &&
435 okalloc)) {
436 if (agbp == NULL &&
437 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
438 agbp = NULL;
439 goto nextag;
440 }
441 up_read(&mp->m_peraglock);
442 return agbp;
443 }
444 }
445unlock_nextag:
446 if (agbp)
447 xfs_trans_brelse(tp, agbp);
448nextag:
449 /*
450 * No point in iterating over the rest, if we're shutting
451 * down.
452 */
453 if (XFS_FORCED_SHUTDOWN(mp)) {
454 up_read(&mp->m_peraglock);
455 return (xfs_buf_t *)0;
456 }
457 agno++;
458 if (agno >= agcount)
459 agno = 0;
460 if (agno == pagno) {
461 if (flags == 0) {
462 up_read(&mp->m_peraglock);
463 return (xfs_buf_t *)0;
464 }
465 flags = 0;
466 }
467 }
468}
469
470/*
471 * Visible inode allocation functions.
472 */
473
474/*
475 * Allocate an inode on disk.
476 * Mode is used to tell whether the new inode will need space, and whether
477 * it is a directory.
478 *
479 * The arguments IO_agbp and alloc_done are defined to work within
480 * the constraint of one allocation per transaction.
481 * xfs_dialloc() is designed to be called twice if it has to do an
482 * allocation to make more free inodes. On the first call,
483 * IO_agbp should be set to NULL. If an inode is available,
484 * i.e., xfs_dialloc() did not need to do an allocation, an inode
485 * number is returned. In this case, IO_agbp would be set to the
486 * current ag_buf and alloc_done set to false.
487 * If an allocation needed to be done, xfs_dialloc would return
488 * the current ag_buf in IO_agbp and set alloc_done to true.
489 * The caller should then commit the current transaction, allocate a new
490 * transaction, and call xfs_dialloc() again, passing in the previous
491 * value of IO_agbp. IO_agbp should be held across the transactions.
492 * Since the agbp is locked across the two calls, the second call is
493 * guaranteed to have a free inode available.
494 *
495 * Once we successfully pick an inode its number is returned and the
496 * on-disk data structures are updated. The inode itself is not read
497 * in, since doing so would break ordering constraints with xfs_reclaim.
498 */
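/*
 * Illustrative two-call pattern (a hypothetical caller with error
 * handling elided), following the protocol described above:
 *
 *	agbp = NULL;
 *	error = xfs_dialloc(tp, parent, mode, okalloc, &agbp,
 *			    &alloc_done, &ino);
 *	if (!error && ino == NULLFSINO && agbp != NULL) {
 *		... commit tp, allocate and reserve a fresh tp ...
 *		error = xfs_dialloc(tp, parent, mode, okalloc,
 *				    &agbp, &alloc_done, &ino);
 *	}
 */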
499int
500xfs_dialloc(
501 xfs_trans_t *tp, /* transaction pointer */
502 xfs_ino_t parent, /* parent inode (directory) */
503 mode_t mode, /* mode bits for new inode */
504 int okalloc, /* ok to allocate more space */
505 xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
506 boolean_t *alloc_done, /* true if we needed to replenish
507 inode freelist */
508 xfs_ino_t *inop) /* inode number allocated */
509{
510 xfs_agnumber_t agcount; /* number of allocation groups */
511 xfs_buf_t *agbp; /* allocation group header's buffer */
512 xfs_agnumber_t agno; /* allocation group number */
513 xfs_agi_t *agi; /* allocation group header structure */
514 xfs_btree_cur_t *cur; /* inode allocation btree cursor */
515 int error; /* error return value */
516 int i; /* result code */
517 int ialloced; /* inode allocation status */
518 int noroom = 0; /* no space for inode blk allocation */
519 xfs_ino_t ino; /* fs-relative inode to be returned */
520 /* REFERENCED */
521 int j; /* result code */
522 xfs_mount_t *mp; /* file system mount structure */
523 int offset; /* index of inode in chunk */
524 xfs_agino_t pagino; /* parent's a.g. relative inode # */
525 xfs_agnumber_t pagno; /* parent's allocation group number */
526 xfs_inobt_rec_t rec; /* inode allocation record */
527 xfs_agnumber_t tagno; /* testing allocation group number */
528 xfs_btree_cur_t *tcur; /* temp cursor */
529 xfs_inobt_rec_t trec; /* temp inode allocation record */
530
531
532 if (*IO_agbp == NULL) {
533 /*
534 * We do not have an agbp, so select an initial allocation
535 * group for inode allocation.
536 */
537 agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
538 /*
539 * Couldn't find an allocation group satisfying the
540 * criteria, give up.
541 */
542 if (!agbp) {
543 *inop = NULLFSINO;
544 return 0;
545 }
546 agi = XFS_BUF_TO_AGI(agbp);
547 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
548 } else {
549 /*
550 * Continue where we left off before. In this case, we
551 * know that the allocation group has free inodes.
552 */
553 agbp = *IO_agbp;
554 agi = XFS_BUF_TO_AGI(agbp);
555 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
556 ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
557 }
558 mp = tp->t_mountp;
559 agcount = mp->m_sb.sb_agcount;
560 agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
561 tagno = agno;
562 pagno = XFS_INO_TO_AGNO(mp, parent);
563 pagino = XFS_INO_TO_AGINO(mp, parent);
564
565 /*
566 * If we have already hit the ceiling of inode blocks then clear
567 * okalloc so we scan all available agi structures for a free
568 * inode.
569 */
570
571 if (mp->m_maxicount &&
572 mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
573 noroom = 1;
574 okalloc = 0;
575 }
576
577 /*
578 * Loop until we find an allocation group that either has free inodes
579 * or in which we can allocate some inodes. Iterate through the
580 * allocation groups upward, wrapping at the end.
581 */
582 *alloc_done = B_FALSE;
583 while (!agi->agi_freecount) {
584 /*
585 * Don't do anything if we're not supposed to allocate
586 * any blocks, just go on to the next ag.
587 */
588 if (okalloc) {
589 /*
590 * Try to allocate some new inodes in the allocation
591 * group.
592 */
593 if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
594 xfs_trans_brelse(tp, agbp);
595 if (error == ENOSPC) {
596 *inop = NULLFSINO;
597 return 0;
598 } else
599 return error;
600 }
601 if (ialloced) {
602 /*
603 * We successfully allocated some inodes, return
604 * the current context to the caller so that it
605 * can commit the current transaction and call
606 * us again where we left off.
607 */
608 ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
609 *alloc_done = B_TRUE;
610 *IO_agbp = agbp;
611 *inop = NULLFSINO;
612 return 0;
613 }
614 }
615 /*
616 * If it failed, give up on this ag.
617 */
618 xfs_trans_brelse(tp, agbp);
619 /*
620 * Go on to the next ag: get its ag header.
621 */
622nextag:
623 if (++tagno == agcount)
624 tagno = 0;
625 if (tagno == agno) {
626 *inop = NULLFSINO;
627 return noroom ? ENOSPC : 0;
628 }
629 down_read(&mp->m_peraglock);
630 if (mp->m_perag[tagno].pagi_inodeok == 0) {
631 up_read(&mp->m_peraglock);
632 goto nextag;
633 }
634 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
635 up_read(&mp->m_peraglock);
636 if (error)
637 goto nextag;
638 agi = XFS_BUF_TO_AGI(agbp);
639 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
640 }
641 /*
642 * Here with an allocation group that has a free inode.
643 * Reset agno since we may have chosen a new ag in the
644 * loop above.
645 */
646 agno = tagno;
647 *IO_agbp = NULL;
648 cur = xfs_btree_init_cursor(mp, tp, agbp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
649 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
650 /*
651 * If pagino is 0 (this is the root inode allocation) use newino.
652 * This must work because we've just allocated some.
653 */
654 if (!pagino)
655 pagino = INT_GET(agi->agi_newino, ARCH_CONVERT);
656#ifdef DEBUG
657 if (cur->bc_nlevels == 1) {
658 int freecount = 0;
659
660 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
661 goto error0;
662 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
663 do {
664 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
665 &rec.ir_freecount, &rec.ir_free, &i)))
666 goto error0;
667 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
668 freecount += rec.ir_freecount;
669 if ((error = xfs_inobt_increment(cur, 0, &i)))
670 goto error0;
671 } while (i == 1);
672
673 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
674 XFS_FORCED_SHUTDOWN(mp));
675 }
676#endif
677 /*
678 * If in the same a.g. as the parent, try to get near the parent.
679 */
680 if (pagno == agno) {
681 if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
682 goto error0;
683 if (i != 0 &&
684 (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
685 &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
686 j == 1 &&
687 rec.ir_freecount > 0) {
688 /*
689 * Found a free inode in the same chunk
690 * as parent, done.
691 */
692 }
693 /*
694 * In the same a.g. as parent, but parent's chunk is full.
695 */
696 else {
697 int doneleft; /* done, to the left */
698 int doneright; /* done, to the right */
699
700 if (error)
701 goto error0;
702 ASSERT(i == 1);
703 ASSERT(j == 1);
704 /*
705 * Duplicate the cursor, search left & right
706 * simultaneously.
707 */
708 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
709 goto error0;
710 /*
711 * Search left with tcur, back up 1 record.
712 */
713 if ((error = xfs_inobt_decrement(tcur, 0, &i)))
714 goto error1;
715 doneleft = !i;
716 if (!doneleft) {
717 if ((error = xfs_inobt_get_rec(tcur,
718 &trec.ir_startino,
719 &trec.ir_freecount,
720 &trec.ir_free, &i)))
721 goto error1;
722 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
723 }
724 /*
725 * Search right with cur, go forward 1 record.
726 */
727 if ((error = xfs_inobt_increment(cur, 0, &i)))
728 goto error1;
729 doneright = !i;
730 if (!doneright) {
731 if ((error = xfs_inobt_get_rec(cur,
732 &rec.ir_startino,
733 &rec.ir_freecount,
734 &rec.ir_free, &i)))
735 goto error1;
736 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
737 }
738 /*
739 * Loop until we find the closest inode chunk
740 * with a free one.
741 */
742 while (!doneleft || !doneright) {
743 int useleft; /* using left inode
744 chunk this time */
745
746 /*
747 * Figure out which block is closer,
748 * if both are valid.
749 */
750 if (!doneleft && !doneright)
751 useleft =
752 pagino -
753 (trec.ir_startino +
754 XFS_INODES_PER_CHUNK - 1) <
755 rec.ir_startino - pagino;
756 else
757 useleft = !doneleft;
758 /*
759 * If checking the left, does it have
760 * free inodes?
761 */
762 if (useleft && trec.ir_freecount) {
763 /*
764 * Yes, set it up as the chunk to use.
765 */
766 rec = trec;
767 xfs_btree_del_cursor(cur,
768 XFS_BTREE_NOERROR);
769 cur = tcur;
770 break;
771 }
772 /*
773 * If checking the right, does it have
774 * free inodes?
775 */
776 if (!useleft && rec.ir_freecount) {
777 /*
778 * Yes, it's already set up.
779 */
780 xfs_btree_del_cursor(tcur,
781 XFS_BTREE_NOERROR);
782 break;
783 }
784 /*
785 * If used the left, get another one
786 * further left.
787 */
788 if (useleft) {
789 if ((error = xfs_inobt_decrement(tcur, 0,
790 &i)))
791 goto error1;
792 doneleft = !i;
793 if (!doneleft) {
794 if ((error = xfs_inobt_get_rec(
795 tcur,
796 &trec.ir_startino,
797 &trec.ir_freecount,
798 &trec.ir_free, &i)))
799 goto error1;
800 XFS_WANT_CORRUPTED_GOTO(i == 1,
801 error1);
802 }
803 }
804 /*
805 * If used the right, get another one
806 * further right.
807 */
808 else {
809 if ((error = xfs_inobt_increment(cur, 0,
810 &i)))
811 goto error1;
812 doneright = !i;
813 if (!doneright) {
814 if ((error = xfs_inobt_get_rec(
815 cur,
816 &rec.ir_startino,
817 &rec.ir_freecount,
818 &rec.ir_free, &i)))
819 goto error1;
820 XFS_WANT_CORRUPTED_GOTO(i == 1,
821 error1);
822 }
823 }
824 }
825 ASSERT(!doneleft || !doneright);
826 }
827 }
828 /*
829 * In a different a.g. from the parent.
830 * See if the most recently allocated block has any free.
831 */
832 else if (INT_GET(agi->agi_newino, ARCH_CONVERT) != NULLAGINO) {
833 if ((error = xfs_inobt_lookup_eq(cur,
834 INT_GET(agi->agi_newino, ARCH_CONVERT), 0, 0, &i)))
835 goto error0;
836 if (i == 1 &&
837 (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
838 &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
839 j == 1 &&
840 rec.ir_freecount > 0) {
841 /*
842 * The last chunk allocated in the group still has
843 * a free inode.
844 */
845 }
846 /*
847 * None left in the last group, search the whole a.g.
848 */
849 else {
850 if (error)
851 goto error0;
852 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
853 goto error0;
854 ASSERT(i == 1);
855 for (;;) {
856 if ((error = xfs_inobt_get_rec(cur,
857 &rec.ir_startino,
858 &rec.ir_freecount, &rec.ir_free,
859 &i)))
860 goto error0;
861 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
862 if (rec.ir_freecount > 0)
863 break;
864 if ((error = xfs_inobt_increment(cur, 0, &i)))
865 goto error0;
866 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
867 }
868 }
869 }
870 offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
871 ASSERT(offset >= 0);
872 ASSERT(offset < XFS_INODES_PER_CHUNK);
873 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
874 XFS_INODES_PER_CHUNK) == 0);
875 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
876 XFS_INOBT_CLR_FREE(&rec, offset);
877 rec.ir_freecount--;
878 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
879 rec.ir_free)))
880 goto error0;
881 INT_MOD(agi->agi_freecount, ARCH_CONVERT, -1);
882 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
883 down_read(&mp->m_peraglock);
884 mp->m_perag[tagno].pagi_freecount--;
885 up_read(&mp->m_peraglock);
886#ifdef DEBUG
887 if (cur->bc_nlevels == 1) {
888 int freecount = 0;
889
890 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
891 goto error0;
892 do {
893 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
894 &rec.ir_freecount, &rec.ir_free, &i)))
895 goto error0;
896 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
897 freecount += rec.ir_freecount;
898 if ((error = xfs_inobt_increment(cur, 0, &i)))
899 goto error0;
900 } while (i == 1);
901 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
902 XFS_FORCED_SHUTDOWN(mp));
903 }
904#endif
905 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
906 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
907 *inop = ino;
908 return 0;
909error1:
910 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
911error0:
912 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
913 return error;
914}
915
916/*
917 * Free disk inode. Carefully avoids touching the incore inode; all
918 * manipulations incore are the caller's responsibility.
919 * The on-disk inode is not changed by this operation, only the
920 * btree (free inode mask) is changed.
921 */
922int
923xfs_difree(
924 xfs_trans_t *tp, /* transaction pointer */
925 xfs_ino_t inode, /* inode to be freed */
926 xfs_bmap_free_t *flist, /* extents to free */
927 int *delete, /* set if inode cluster was deleted */
928 xfs_ino_t *first_ino) /* first inode in deleted cluster */
929{
930 /* REFERENCED */
931 xfs_agblock_t agbno; /* block number containing inode */
932 xfs_buf_t *agbp; /* buffer containing allocation group header */
933 xfs_agino_t agino; /* inode number relative to allocation group */
934 xfs_agnumber_t agno; /* allocation group number */
935 xfs_agi_t *agi; /* allocation group header */
936 xfs_btree_cur_t *cur; /* inode btree cursor */
937 int error; /* error return value */
938 int i; /* result code */
939 int ilen; /* inodes in an inode cluster */
940 xfs_mount_t *mp; /* mount structure for filesystem */
941 int off; /* offset of inode in inode chunk */
942 xfs_inobt_rec_t rec; /* btree record */
943
944 mp = tp->t_mountp;
945
946 /*
947 * Break up inode number into its components.
948 */
949 agno = XFS_INO_TO_AGNO(mp, inode);
950 if (agno >= mp->m_sb.sb_agcount) {
951 cmn_err(CE_WARN,
952 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.",
953 agno, mp->m_sb.sb_agcount, mp->m_fsname);
954 ASSERT(0);
955 return XFS_ERROR(EINVAL);
956 }
957 agino = XFS_INO_TO_AGINO(mp, inode);
958 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
959 cmn_err(CE_WARN,
960 "xfs_difree: inode != XFS_AGINO_TO_INO() (%d != %d) on %s. Returning EINVAL.",
961 inode, XFS_AGINO_TO_INO(mp, agno, agino), mp->m_fsname);
962 ASSERT(0);
963 return XFS_ERROR(EINVAL);
964 }
965 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
966 if (agbno >= mp->m_sb.sb_agblocks) {
967 cmn_err(CE_WARN,
968 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.",
969 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
970 ASSERT(0);
971 return XFS_ERROR(EINVAL);
972 }
973 /*
974 * Get the allocation group header.
975 */
976 down_read(&mp->m_peraglock);
977 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
978 up_read(&mp->m_peraglock);
979 if (error) {
980 cmn_err(CE_WARN,
981 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
982 error, mp->m_fsname);
983 return error;
984 }
985 agi = XFS_BUF_TO_AGI(agbp);
986 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
987 ASSERT(agbno < INT_GET(agi->agi_length, ARCH_CONVERT));
988 /*
989 * Initialize the cursor.
990 */
991 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
992 (xfs_inode_t *)0, 0);
993#ifdef DEBUG
994 if (cur->bc_nlevels == 1) {
995 int freecount = 0;
996
997 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
998 goto error0;
999 do {
1000 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
1001 &rec.ir_freecount, &rec.ir_free, &i)))
1002 goto error0;
1003 if (i) {
1004 freecount += rec.ir_freecount;
1005 if ((error = xfs_inobt_increment(cur, 0, &i)))
1006 goto error0;
1007 }
1008 } while (i == 1);
1009 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
1010 XFS_FORCED_SHUTDOWN(mp));
1011 }
1012#endif
1013 /*
1014 * Look for the entry describing this inode.
1015 */
1016 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
1017 cmn_err(CE_WARN,
1018 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
1019 error, mp->m_fsname);
1020 goto error0;
1021 }
1022 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1023 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
1024 &rec.ir_free, &i))) {
1025 cmn_err(CE_WARN,
1026 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1027 error, mp->m_fsname);
1028 goto error0;
1029 }
1030 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1031 /*
1032 * Get the offset in the inode chunk.
1033 */
1034 off = agino - rec.ir_startino;
1035 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1036 ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
1037 /*
1038 * Mark the inode free & increment the count.
1039 */
1040 XFS_INOBT_SET_FREE(&rec, off);
1041 rec.ir_freecount++;
1042
1043 /*
1044	 * When an inode cluster is free, it becomes eligible for removal.
1045 */
1046 if ((mp->m_flags & XFS_MOUNT_IDELETE) &&
1047 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
1048
1049 *delete = 1;
1050 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1051
1052 /*
1053 * Remove the inode cluster from the AGI B+Tree, adjust the
1054 * AGI and Superblock inode counts, and mark the disk space
1055 * to be freed when the transaction is committed.
1056 */
1057 ilen = XFS_IALLOC_INODES(mp);
1058 INT_MOD(agi->agi_count, ARCH_CONVERT, -ilen);
1059 INT_MOD(agi->agi_freecount, ARCH_CONVERT, -(ilen - 1));
1060 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1061 down_read(&mp->m_peraglock);
1062 mp->m_perag[agno].pagi_freecount -= ilen - 1;
1063 up_read(&mp->m_peraglock);
1064 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1065 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1066
1067 if ((error = xfs_inobt_delete(cur, &i))) {
 1068		cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete() returned an error %d on %s. Returning error.",
1069 error, mp->m_fsname);
1070 goto error0;
1071 }
1072
1073 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
1074 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
1075 XFS_IALLOC_BLOCKS(mp), flist, mp);
1076 } else {
1077 *delete = 0;
1078
1079 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
1080 cmn_err(CE_WARN,
1081 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
1082 error, mp->m_fsname);
1083 goto error0;
1084 }
1085 /*
1086 * Change the inode free counts and log the ag/sb changes.
1087 */
1088 INT_MOD(agi->agi_freecount, ARCH_CONVERT, 1);
1089 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1090 down_read(&mp->m_peraglock);
1091 mp->m_perag[agno].pagi_freecount++;
1092 up_read(&mp->m_peraglock);
1093 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1094 }
1095
1096#ifdef DEBUG
1097 if (cur->bc_nlevels == 1) {
1098 int freecount = 0;
1099
1100 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1101 goto error0;
1102 do {
1103 if ((error = xfs_inobt_get_rec(cur,
1104 &rec.ir_startino,
1105 &rec.ir_freecount,
1106 &rec.ir_free, &i)))
1107 goto error0;
1108 if (i) {
1109 freecount += rec.ir_freecount;
1110 if ((error = xfs_inobt_increment(cur, 0, &i)))
1111 goto error0;
1112 }
1113 } while (i == 1);
1114 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
1115 XFS_FORCED_SHUTDOWN(mp));
1116 }
1117#endif
1118 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1119 return 0;
1120
1121error0:
1122 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1123 return error;
1124}
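/*
 * Standalone illustration (not in the original source) of the free-mask
 * bookkeeping in xfs_difree() above: set the bit for the freed inode,
 * bump the free count, and test the whole-chunk-free condition that
 * makes a cluster eligible for removal.  All numbers here are made up.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long long ir_free = ~1ULL;	/* only inode 0 still in use */
	int ir_freecount = 63;
	int off = 0;				/* chunk offset being freed */
	int inodes_per_chunk = 64;		/* XFS_INODES_PER_CHUNK */

	ir_free |= 1ULL << off;			/* XFS_INOBT_SET_FREE */
	ir_freecount++;
	if (ir_freecount == inodes_per_chunk)
		printf("chunk empty: cluster may be removed\n");
	return 0;
}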
1125
1126/*
 1127 * Return the location of the inode in bno/len/off, for mapping it into a buffer.
1128 */
1129/*ARGSUSED*/
1130int
1131xfs_dilocate(
1132 xfs_mount_t *mp, /* file system mount structure */
1133 xfs_trans_t *tp, /* transaction pointer */
1134 xfs_ino_t ino, /* inode to locate */
1135 xfs_fsblock_t *bno, /* output: block containing inode */
1136 int *len, /* output: num blocks in inode cluster */
1137 int *off, /* output: index in block of inode */
1138 uint flags) /* flags concerning inode lookup */
1139{
1140 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1141 xfs_buf_t *agbp; /* agi buffer */
1142 xfs_agino_t agino; /* inode number within alloc group */
1143 xfs_agnumber_t agno; /* allocation group number */
1144 int blks_per_cluster; /* num blocks per inode cluster */
1145 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1146 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1147 __int32_t chunk_cnt; /* count of free inodes in chunk */
1148 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1149 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1150 xfs_btree_cur_t *cur; /* inode btree cursor */
1151 int error; /* error code */
1152 int i; /* temp state */
1153 int offset; /* index of inode in its buffer */
1154 int offset_agbno; /* blks from chunk start to inode */
1155
1156 ASSERT(ino != NULLFSINO);
1157 /*
1158 * Split up the inode number into its parts.
1159 */
1160 agno = XFS_INO_TO_AGNO(mp, ino);
1161 agino = XFS_INO_TO_AGINO(mp, ino);
1162 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1163 if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
1164 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1165#ifdef DEBUG
1166 if (agno >= mp->m_sb.sb_agcount) {
1167 xfs_fs_cmn_err(CE_ALERT, mp,
1168 "xfs_dilocate: agno (%d) >= "
1169 "mp->m_sb.sb_agcount (%d)",
1170 agno, mp->m_sb.sb_agcount);
1171 }
1172 if (agbno >= mp->m_sb.sb_agblocks) {
1173 xfs_fs_cmn_err(CE_ALERT, mp,
1174 "xfs_dilocate: agbno (0x%llx) >= "
1175 "mp->m_sb.sb_agblocks (0x%lx)",
1176 (unsigned long long) agbno,
1177 (unsigned long) mp->m_sb.sb_agblocks);
1178 }
1179 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1180 xfs_fs_cmn_err(CE_ALERT, mp,
1181 "xfs_dilocate: ino (0x%llx) != "
1182 "XFS_AGINO_TO_INO(mp, agno, agino) "
1183 "(0x%llx)",
1184 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1185 }
1186#endif /* DEBUG */
1187 return XFS_ERROR(EINVAL);
1188 }
1189 if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
1190 !(flags & XFS_IMAP_LOOKUP)) {
1191 offset = XFS_INO_TO_OFFSET(mp, ino);
1192 ASSERT(offset < mp->m_sb.sb_inopblock);
1193 *bno = XFS_AGB_TO_FSB(mp, agno, agbno);
1194 *off = offset;
1195 *len = 1;
1196 return 0;
1197 }
1198 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1199 if (*bno != NULLFSBLOCK) {
1200 offset = XFS_INO_TO_OFFSET(mp, ino);
1201 ASSERT(offset < mp->m_sb.sb_inopblock);
1202 cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
1203 *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1204 offset;
1205 *len = blks_per_cluster;
1206 return 0;
1207 }
1208 if (mp->m_inoalign_mask) {
1209 offset_agbno = agbno & mp->m_inoalign_mask;
1210 chunk_agbno = agbno - offset_agbno;
1211 } else {
1212 down_read(&mp->m_peraglock);
1213 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1214 up_read(&mp->m_peraglock);
1215 if (error) {
1216#ifdef DEBUG
1217 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1218 "xfs_ialloc_read_agi() returned "
1219 "error %d, agno %d",
1220 error, agno);
1221#endif /* DEBUG */
1222 return error;
1223 }
1224 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
1225 (xfs_inode_t *)0, 0);
1226 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
1227#ifdef DEBUG
1228 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1229 "xfs_inobt_lookup_le() failed");
1230#endif /* DEBUG */
1231 goto error0;
1232 }
1233 if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
1234 &chunk_free, &i))) {
1235#ifdef DEBUG
1236 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1237 "xfs_inobt_get_rec() failed");
1238#endif /* DEBUG */
1239 goto error0;
1240 }
1241 if (i == 0) {
1242#ifdef DEBUG
1243 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
 1244				"xfs_inobt_get_rec() found no record");
1245#endif /* DEBUG */
1246 error = XFS_ERROR(EINVAL);
1247 }
1248 xfs_trans_brelse(tp, agbp);
1249 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1250 if (error)
1251 return error;
1252 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
1253 offset_agbno = agbno - chunk_agbno;
1254 }
1255 ASSERT(agbno >= chunk_agbno);
1256 cluster_agbno = chunk_agbno +
1257 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1258 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1259 XFS_INO_TO_OFFSET(mp, ino);
1260 *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
1261 *off = offset;
1262 *len = blks_per_cluster;
1263 return 0;
1264error0:
1265 xfs_trans_brelse(tp, agbp);
1266 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1267 return error;
1268}
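/*
 * A minimal standalone sketch of the cluster-mapping arithmetic used by
 * xfs_dilocate() above, with hypothetical geometry (32 inodes per block,
 * 4 blocks per cluster).  The real code derives these values from the
 * superblock; the numbers here are illustrative only.
 */
#include <stdio.h>

int
main(void)
{
	int inopblock = 32;		/* inodes per fs block (sb_inopblock) */
	int blks_per_cluster = 4;	/* cluster size >> sb_blocklog */
	int agbno = 103;		/* block of the inode within the AG */
	int chunk_agbno = 100;		/* first block of the inode chunk */
	int ino_offset = 7;		/* XFS_INO_TO_OFFSET(): index in block */

	int offset_agbno = agbno - chunk_agbno;
	int cluster_agbno = chunk_agbno +
		((offset_agbno / blks_per_cluster) * blks_per_cluster);
	int off = (agbno - cluster_agbno) * inopblock + ino_offset;

	/* *bno would be XFS_AGB_TO_FSB(mp, agno, cluster_agbno) */
	printf("cluster at agbno %d, inode index %d, len %d\n",
		cluster_agbno, off, blks_per_cluster);
	return 0;
}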
1269
1270/*
1271 * Compute and fill in value of m_in_maxlevels.
1272 */
1273void
1274xfs_ialloc_compute_maxlevels(
1275 xfs_mount_t *mp) /* file system mount structure */
1276{
1277 int level;
1278 uint maxblocks;
1279 uint maxleafents;
1280 int minleafrecs;
1281 int minnoderecs;
1282
1283 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
1284 XFS_INODES_PER_CHUNK_LOG;
 1285	minleafrecs = mp->m_inobt_mnr[0];
 1286	minnoderecs = mp->m_inobt_mnr[1];
1287 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1288 for (level = 1; maxblocks > 1; level++)
1289 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1290 mp->m_in_maxlevels = level;
1291}
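/*
 * A standalone illustration of the loop in xfs_ialloc_compute_maxlevels()
 * above: divide the maximum number of leaf entries by the minimum records
 * per block until one block suffices.  The record counts are made-up
 * stand-ins for m_inobt_mnr[]; real values depend on the block size.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int maxleafents = 1u << 22;	/* hypothetical */
	int minleafrecs = 125;			/* hypothetical leaf minimum */
	int minnoderecs = 62;			/* hypothetical node minimum */
	unsigned int maxblocks;
	int level;

	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
	for (level = 1; maxblocks > 1; level++)
		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
	printf("max btree levels = %d\n", level);	/* 4 for these numbers */
	return 0;
}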
1292
1293/*
1294 * Log specified fields for the ag hdr (inode section)
1295 */
1296void
1297xfs_ialloc_log_agi(
1298 xfs_trans_t *tp, /* transaction pointer */
1299 xfs_buf_t *bp, /* allocation group header buffer */
1300 int fields) /* bitmask of fields to log */
1301{
1302 int first; /* first byte number */
1303 int last; /* last byte number */
1304 static const short offsets[] = { /* field starting offsets */
1305 /* keep in sync with bit definitions */
1306 offsetof(xfs_agi_t, agi_magicnum),
1307 offsetof(xfs_agi_t, agi_versionnum),
1308 offsetof(xfs_agi_t, agi_seqno),
1309 offsetof(xfs_agi_t, agi_length),
1310 offsetof(xfs_agi_t, agi_count),
1311 offsetof(xfs_agi_t, agi_root),
1312 offsetof(xfs_agi_t, agi_level),
1313 offsetof(xfs_agi_t, agi_freecount),
1314 offsetof(xfs_agi_t, agi_newino),
1315 offsetof(xfs_agi_t, agi_dirino),
1316 offsetof(xfs_agi_t, agi_unlinked),
1317 sizeof(xfs_agi_t)
1318 };
1319#ifdef DEBUG
1320 xfs_agi_t *agi; /* allocation group header */
1321
1322 agi = XFS_BUF_TO_AGI(bp);
1323 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
1324#endif
1325 /*
1326 * Compute byte offsets for the first and last fields.
1327 */
1328 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
1329 /*
1330 * Log the allocation group inode header buffer.
1331 */
1332 xfs_trans_log_buf(tp, bp, first, last);
1333}
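/*
 * A self-contained sketch (not in the original source) of the first/last
 * computation that xfs_btree_offsets() performs for xfs_ialloc_log_agi()
 * above: given a bitmask of fields and a table of field offsets, find the
 * byte range covering the lowest through highest set bits.  The offsets
 * here are invented; the real table is the offsets[] array above.
 */
#include <stdio.h>

static void
offsets_to_range(int fields, const short *offsets, int nbits,
		 int *first, int *last)
{
	int i;

	/* lowest set bit gives the first logged byte */
	for (i = 0; i < nbits; i++)
		if (fields & (1 << i))
			break;
	*first = offsets[i];
	/* highest set bit: the range ends just before the next field */
	for (i = nbits - 1; i >= 0; i--)
		if (fields & (1 << i))
			break;
	*last = offsets[i + 1] - 1;
}

int
main(void)
{
	static const short offsets[] = { 0, 4, 8, 12, 16 };	/* toy table */
	int first, last;

	offsets_to_range(0x6, offsets, 4, &first, &last);	/* fields 1,2 */
	printf("log bytes %d..%d\n", first, last);		/* 4..11 */
	return 0;
}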
1334
1335/*
1336 * Read in the allocation group header (inode allocation section)
1337 */
1338int
1339xfs_ialloc_read_agi(
1340 xfs_mount_t *mp, /* file system mount structure */
1341 xfs_trans_t *tp, /* transaction pointer */
1342 xfs_agnumber_t agno, /* allocation group number */
1343 xfs_buf_t **bpp) /* allocation group hdr buf */
1344{
1345 xfs_agi_t *agi; /* allocation group header */
1346 int agi_ok; /* agi is consistent */
1347 xfs_buf_t *bp; /* allocation group hdr buf */
1348 xfs_perag_t *pag; /* per allocation group data */
1349 int error;
1350
1351 ASSERT(agno != NULLAGNUMBER);
1352 error = xfs_trans_read_buf(
1353 mp, tp, mp->m_ddev_targp,
1354 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1355 XFS_FSS_TO_BB(mp, 1), 0, &bp);
1356 if (error)
1357 return error;
1358 ASSERT(bp && !XFS_BUF_GETERROR(bp));
1359
1360 /*
1361 * Validate the magic number of the agi block.
1362 */
1363 agi = XFS_BUF_TO_AGI(bp);
1364 agi_ok =
1365 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
1366 XFS_AGI_GOOD_VERSION(
1367 INT_GET(agi->agi_versionnum, ARCH_CONVERT));
1368 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1369 XFS_RANDOM_IALLOC_READ_AGI))) {
1370 XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW,
1371 mp, agi);
1372 xfs_trans_brelse(tp, bp);
1373 return XFS_ERROR(EFSCORRUPTED);
1374 }
1375 pag = &mp->m_perag[agno];
1376 if (!pag->pagi_init) {
1377 pag->pagi_freecount = INT_GET(agi->agi_freecount, ARCH_CONVERT);
1378 pag->pagi_init = 1;
1379 } else {
1380 /*
1381 * It's possible for these to be out of sync if
1382 * we are in the middle of a forced shutdown.
1383 */
1384 ASSERT(pag->pagi_freecount ==
1385 INT_GET(agi->agi_freecount, ARCH_CONVERT)
1386 || XFS_FORCED_SHUTDOWN(mp));
1387 }
1388
1389#ifdef DEBUG
1390 {
1391 int i;
1392
1393 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1394 ASSERT(agi->agi_unlinked[i]);
1395 }
1396#endif
1397
1398 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
1399 *bpp = bp;
1400 return 0;
1401}
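/*
 * Usage sketch (not in the original source): read the AGI for one
 * allocation group under the per-AG lock, sample its free-inode count,
 * and release the buffer.  The wrapper name is hypothetical and error
 * handling is abbreviated; this mirrors how xfs_difree() above drives
 * xfs_ialloc_read_agi().
 */
STATIC int
xfs_agi_freecount_example(
	xfs_mount_t	*mp,		/* file system mount structure */
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_agnumber_t	agno,		/* allocation group number */
	__int32_t	*freecount)	/* output: AGI free inode count */
{
	xfs_buf_t	*agbp;
	xfs_agi_t	*agi;
	int		error;

	down_read(&mp->m_peraglock);
	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
	up_read(&mp->m_peraglock);
	if (error)
		return error;
	agi = XFS_BUF_TO_AGI(agbp);
	*freecount = INT_GET(agi->agi_freecount, ARCH_CONVERT);
	xfs_trans_brelse(tp, agbp);	/* done with the AGI buffer */
	return 0;
}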
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
new file mode 100644
index 000000000000..db6d0015cecf
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.h
@@ -0,0 +1,184 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IALLOC_H__
33#define __XFS_IALLOC_H__
34
35struct xfs_buf;
36struct xfs_dinode;
37struct xfs_mount;
38struct xfs_trans;
39
40/*
41 * Allocation parameters for inode allocation.
42 */
43#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_INODES)
44int xfs_ialloc_inodes(struct xfs_mount *mp);
45#define XFS_IALLOC_INODES(mp) xfs_ialloc_inodes(mp)
46#else
47#define XFS_IALLOC_INODES(mp) ((mp)->m_ialloc_inos)
48#endif
49#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_BLOCKS)
50xfs_extlen_t xfs_ialloc_blocks(struct xfs_mount *mp);
51#define XFS_IALLOC_BLOCKS(mp) xfs_ialloc_blocks(mp)
52#else
53#define XFS_IALLOC_BLOCKS(mp) ((mp)->m_ialloc_blks)
54#endif
55
56/*
57 * For small block file systems, move inodes in clusters of this size.
58 * When we don't have a lot of memory, however, we go a bit smaller
59 * to reduce the number of AGI and ialloc btree blocks we need to keep
60 * around for xfs_dilocate(). We choose which one to use in
61 * xfs_mount_int().
62 */
63#define XFS_INODE_BIG_CLUSTER_SIZE 8192
64#define XFS_INODE_SMALL_CLUSTER_SIZE 4096
65#define	XFS_INODE_CLUSTER_SIZE(mp)	((mp)->m_inode_cluster_size)
66
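/*
 * Worked example (illustrative, not part of the original header): for a
 * filesystem with 512-byte blocks (sb_blocklog == 9) and the small
 * cluster size, an inode cluster covers
 *
 *	XFS_INODE_SMALL_CLUSTER_SIZE >> 9 == 4096 >> 9 == 8
 *
 * filesystem blocks; this is the blks_per_cluster value that
 * xfs_dilocate() computes.
 */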
67/*
68 * Make an inode pointer out of the buffer/offset.
69 */
70#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MAKE_IPTR)
71struct xfs_dinode *xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o);
72#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o)
73#else
74#define XFS_MAKE_IPTR(mp,b,o) \
75 ((xfs_dinode_t *)(xfs_buf_offset(b, (o) << (mp)->m_sb.sb_inodelog)))
76#endif
77
78/*
79 * Find a free (set) bit in the inode bitmask.
80 */
81#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_FIND_FREE)
82int xfs_ialloc_find_free(xfs_inofree_t *fp);
83#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp)
84#else
85#define XFS_IALLOC_FIND_FREE(fp) xfs_lowbit64(*(fp))
86#endif
87
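/*
 * Illustrative stand-in (not part of the original header) for what
 * XFS_IALLOC_FIND_FREE() computes: the index of the lowest set bit in
 * the free-inode mask, i.e. the first free inode in the chunk.
 * xfs_lowbit64() does this with word-size tricks; the plain loop below
 * only demonstrates the semantics.
 */
static inline int
xfs_ialloc_find_free_example(xfs_inofree_t *fp)
{
	int	i;

	for (i = 0; i < (int)(sizeof(*fp) * 8); i++)
		if (*fp & ((xfs_inofree_t)1 << i))
			return i;	/* offset of first free inode */
	return -1;			/* mask empty: no free inodes */
}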
88
89#ifdef __KERNEL__
90
91/*
92 * Prototypes for visible xfs_ialloc.c routines.
93 */
94
95/*
96 * Allocate an inode on disk.
97 * Mode is used to tell whether the new inode will need space, and whether
98 * it is a directory.
99 *
100 * To work within the constraint of one allocation per transaction,
101 * xfs_dialloc() is designed to be called twice if it has to do an
102 * allocation to make more free inodes. If an inode is
103 * available without an allocation, *agbp is set to the AGI buffer of
104 * the allocation group used and *alloc_done is set to false.
105 * If an allocation had to be done, *agbp is set to the AGI buffer of
106 * the allocation group that was extended and *alloc_done is set to true.
107 * The caller should then commit the current transaction and allocate a new
108 * transaction. xfs_dialloc() should then be called again with
109 * the agbp value returned from the previous call.
110 *
111 * Once we successfully pick an inode its number is returned and the
112 * on-disk data structures are updated. The inode itself is not read
113 * in, since doing so would break ordering constraints with xfs_reclaim.
114 *
115 * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
116 */
117int /* error */
118xfs_dialloc(
119 struct xfs_trans *tp, /* transaction pointer */
120 xfs_ino_t parent, /* parent inode (directory) */
121 mode_t mode, /* mode bits for new inode */
122 int okalloc, /* ok to allocate more space */
123 struct xfs_buf **agbp, /* buf for a.g. inode header */
124 boolean_t *alloc_done, /* an allocation was done to replenish
125 the free inodes */
126 xfs_ino_t *inop); /* inode number allocated */
127
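/*
 * A hedged sketch (not part of the original header) of the two-call
 * protocol described above.  Transaction reservation, commit and restart
 * are elided: where the comment says "commit", a real caller would
 * commit tp and begin a new transaction before the second call.  The
 * function name is hypothetical.
 */
static int
xfs_dialloc_example(
	xfs_trans_t	*tp,		/* current transaction */
	xfs_ino_t	parent,		/* parent directory inode */
	mode_t		mode,		/* mode of the new inode */
	xfs_ino_t	*inop)		/* output: allocated inode number */
{
	xfs_buf_t	*agbp = NULL;	/* must be NULL on the first call */
	boolean_t	alloc_done = B_FALSE;
	int		error;

	error = xfs_dialloc(tp, parent, mode, 1, &agbp, &alloc_done, inop);
	if (error)
		return error;
	if (alloc_done) {
		/*
		 * New inode chunks were allocated.  Commit tp and start a
		 * new transaction here, then retry with the same agbp.
		 */
		error = xfs_dialloc(tp, parent, mode, 1, &agbp, &alloc_done,
				    inop);
	}
	return error;
}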
128/*
129 * Free disk inode. Carefully avoids touching the incore inode, all
130 * manipulations incore are the caller's responsibility.
131 * The on-disk inode is not changed by this operation, only the
132 * btree (free inode mask) is changed.
133 */
134int /* error */
135xfs_difree(
136 struct xfs_trans *tp, /* transaction pointer */
137 xfs_ino_t inode, /* inode to be freed */
138 struct xfs_bmap_free *flist, /* extents to free */
139 int *delete, /* set if inode cluster was deleted */
140 xfs_ino_t *first_ino); /* first inode in deleted cluster */
141
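/*
 * A minimal sketch (not part of the original header) of a call to
 * xfs_difree() above from a transaction that already carries a bmap
 * free list.  Error handling is abbreviated and the wrapper name is
 * hypothetical.
 */
static int
xfs_difree_example(
	xfs_trans_t		*tp,	/* transaction pointer */
	xfs_ino_t		ino,	/* inode to free */
	struct xfs_bmap_free	*flist)	/* list of extents to free */
{
	int		delete;		/* whole cluster freed? */
	xfs_ino_t	first_ino;	/* first inode of freed cluster */
	int		error;

	error = xfs_difree(tp, ino, flist, &delete, &first_ino);
	if (error)
		return error;
	if (delete) {
		/*
		 * The entire cluster starting at first_ino is gone from
		 * the inode btree; its disk space is freed when the
		 * transaction commits (see xfs_bmap_add_free in xfs_difree).
		 */
	}
	return 0;
}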
142/*
143 * Return the location of the inode in bno/len/off,
144 * for mapping it into a buffer.
145 */
146int
147xfs_dilocate(
148 struct xfs_mount *mp, /* file system mount structure */
149 struct xfs_trans *tp, /* transaction pointer */
150 xfs_ino_t ino, /* inode to locate */
151 xfs_fsblock_t *bno, /* output: block containing inode */
152 int *len, /* output: num blocks in cluster*/
153 int *off, /* output: index in block of inode */
154 uint flags); /* flags for inode btree lookup */
155
156/*
157 * Compute and fill in value of m_in_maxlevels.
158 */
159void
160xfs_ialloc_compute_maxlevels(
161 struct xfs_mount *mp); /* file system mount structure */
162
163/*
164 * Log specified fields for the ag hdr (inode section)
165 */
166void
167xfs_ialloc_log_agi(
168 struct xfs_trans *tp, /* transaction pointer */
169 struct xfs_buf *bp, /* allocation group header buffer */
170 int fields); /* bitmask of fields to log */
171
172/*
173 * Read in the allocation group header (inode allocation section)
174 */
175int /* error */
176xfs_ialloc_read_agi(
177 struct xfs_mount *mp, /* file system mount structure */
178 struct xfs_trans *tp, /* transaction pointer */
179 xfs_agnumber_t agno, /* allocation group number */
180 struct xfs_buf **bpp); /* allocation group hdr buf */
181
182#endif /* __KERNEL__ */
183
184#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000000..2d4daecec990
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -0,0 +1,2094 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_alloc.h"
51#include "xfs_error.h"
52
53/*
54 * Inode allocation management for XFS.
55 */
56
57/*
58 * Prototypes for internal functions.
59 */
60
61STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
62STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
63STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
64STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
65STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
66STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
67STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
68STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
69 xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
70STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
71
72/*
73 * Internal functions.
74 */
75
76/*
77 * Single level of the xfs_inobt_delete record deletion routine.
78 * Delete record pointed to by cur/level.
79 * Remove the record from its block then rebalance the tree.
80 * Set *stat to 0 for failure, 1 for done, 2 to go on to the next level.
81 */
82STATIC int /* error */
83xfs_inobt_delrec(
84 xfs_btree_cur_t *cur, /* btree cursor */
85 int level, /* level removing record from */
86 int *stat) /* fail/done/go-on */
87{
88 xfs_buf_t *agbp; /* buffer for a.g. inode header */
89 xfs_mount_t *mp; /* mount structure */
90 xfs_agi_t *agi; /* allocation group inode header */
91 xfs_inobt_block_t *block; /* btree block record/key lives in */
92 xfs_agblock_t bno; /* btree block number */
93 xfs_buf_t *bp; /* buffer for block */
94 int error; /* error return value */
95 int i; /* loop index */
96 xfs_inobt_key_t key; /* kp points here if block is level 0 */
97 xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
98 xfs_agblock_t lbno; /* left block's block number */
99 xfs_buf_t *lbp; /* left block's buffer pointer */
100 xfs_inobt_block_t *left; /* left btree block */
101 xfs_inobt_key_t *lkp; /* left block key pointer */
102 xfs_inobt_ptr_t *lpp; /* left block address pointer */
103 int lrecs = 0; /* number of records in left block */
104 xfs_inobt_rec_t *lrp; /* left block record pointer */
105 xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
106 int ptr; /* index in btree block for this rec */
107 xfs_agblock_t rbno; /* right block's block number */
108 xfs_buf_t *rbp; /* right block's buffer pointer */
109 xfs_inobt_block_t *right; /* right btree block */
110 xfs_inobt_key_t *rkp; /* right block key pointer */
111 xfs_inobt_rec_t *rp; /* pointer to btree records */
112 xfs_inobt_ptr_t *rpp; /* right block address pointer */
113 int rrecs = 0; /* number of records in right block */
114 int numrecs;
115 xfs_inobt_rec_t *rrp; /* right block record pointer */
116 xfs_btree_cur_t *tcur; /* temporary btree cursor */
117
118 mp = cur->bc_mp;
119
120 /*
121 * Get the index of the entry being deleted, check for nothing there.
122 */
123 ptr = cur->bc_ptrs[level];
124 if (ptr == 0) {
125 *stat = 0;
126 return 0;
127 }
128
129 /*
130 * Get the buffer & block containing the record or key/ptr.
131 */
132 bp = cur->bc_bufs[level];
133 block = XFS_BUF_TO_INOBT_BLOCK(bp);
134#ifdef DEBUG
135 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
136 return error;
137#endif
138 /*
139 * Fail if we're off the end of the block.
140 */
141
142 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
143 if (ptr > numrecs) {
144 *stat = 0;
145 return 0;
146 }
147 /*
148 * It's a nonleaf. Excise the key and ptr being deleted, by
149 * sliding the entries past them down one.
150 * Log the changed areas of the block.
151 */
152 if (level > 0) {
153 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
154 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
155#ifdef DEBUG
156 for (i = ptr; i < numrecs; i++) {
157 if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i], ARCH_CONVERT), level)))
158 return error;
159 }
160#endif
161 if (ptr < numrecs) {
162 memmove(&kp[ptr - 1], &kp[ptr],
163 (numrecs - ptr) * sizeof(*kp));
164 memmove(&pp[ptr - 1], &pp[ptr],
 165			(numrecs - ptr) * sizeof(*pp));
166 xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
167 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
168 }
169 }
170 /*
171 * It's a leaf. Excise the record being deleted, by sliding the
172 * entries past it down one. Log the changed areas of the block.
173 */
174 else {
175 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
176 if (ptr < numrecs) {
177 memmove(&rp[ptr - 1], &rp[ptr],
178 (numrecs - ptr) * sizeof(*rp));
179 xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
180 }
181 /*
182 * If it's the first record in the block, we'll need a key
183 * structure to pass up to the next level (updkey).
184 */
185 if (ptr == 1) {
186 key.ir_startino = rp->ir_startino;
187 kp = &key;
188 }
189 }
190 /*
191 * Decrement and log the number of entries in the block.
192 */
193 numrecs--;
194 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
195 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
196 /*
197 * Is this the root level? If so, we're almost done.
198 */
199 if (level == cur->bc_nlevels - 1) {
200 /*
201 * If this is the root level,
202 * and there's only one entry left,
203 * and it's NOT the leaf level,
204 * then we can get rid of this level.
205 */
206 if (numrecs == 1 && level > 0) {
207 agbp = cur->bc_private.i.agbp;
208 agi = XFS_BUF_TO_AGI(agbp);
209 /*
210 * pp is still set to the first pointer in the block.
211 * Make it the new root of the btree.
212 */
213 bno = INT_GET(agi->agi_root, ARCH_CONVERT);
214 agi->agi_root = *pp;
215 INT_MOD(agi->agi_level, ARCH_CONVERT, -1);
216 /*
217 * Free the block.
218 */
219 if ((error = xfs_free_extent(cur->bc_tp,
220 XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1)))
221 return error;
222 xfs_trans_binval(cur->bc_tp, bp);
223 xfs_ialloc_log_agi(cur->bc_tp, agbp,
224 XFS_AGI_ROOT | XFS_AGI_LEVEL);
225 /*
226 * Update the cursor so there's one fewer level.
227 */
228 cur->bc_bufs[level] = NULL;
229 cur->bc_nlevels--;
230 } else if (level > 0 &&
231 (error = xfs_inobt_decrement(cur, level, &i)))
232 return error;
233 *stat = 1;
234 return 0;
235 }
236 /*
237 * If we deleted the leftmost entry in the block, update the
238 * key values above us in the tree.
239 */
240 if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
241 return error;
242 /*
243 * If the number of records remaining in the block is at least
244 * the minimum, we're done.
245 */
246 if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
247 if (level > 0 &&
248 (error = xfs_inobt_decrement(cur, level, &i)))
249 return error;
250 *stat = 1;
251 return 0;
252 }
253 /*
254 * Otherwise, we have to move some records around to keep the
255 * tree balanced. Look at the left and right sibling blocks to
256 * see if we can re-balance by moving only one record.
257 */
258 rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
259 lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
260 bno = NULLAGBLOCK;
261 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
262 /*
263 * Duplicate the cursor so our btree manipulations here won't
264 * disrupt the next level up.
265 */
266 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
267 return error;
268 /*
269 * If there's a right sibling, see if it's ok to shift an entry
270 * out of it.
271 */
272 if (rbno != NULLAGBLOCK) {
273 /*
274 * Move the temp cursor to the last entry in the next block.
275 * Actually any entry but the first would suffice.
276 */
277 i = xfs_btree_lastrec(tcur, level);
278 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
279 if ((error = xfs_inobt_increment(tcur, level, &i)))
280 goto error0;
281 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
282 i = xfs_btree_lastrec(tcur, level);
283 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
284 /*
285 * Grab a pointer to the block.
286 */
287 rbp = tcur->bc_bufs[level];
288 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
289#ifdef DEBUG
290 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
291 goto error0;
292#endif
293 /*
294 * Grab the current block number, for future use.
295 */
296 bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
297 /*
298 * If right block is full enough so that removing one entry
299 * won't make it too empty, and left-shifting an entry out
300 * of right to us works, we're done.
301 */
302 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
303 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
304 if ((error = xfs_inobt_lshift(tcur, level, &i)))
305 goto error0;
306 if (i) {
307 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
308 XFS_INOBT_BLOCK_MINRECS(level, cur));
309 xfs_btree_del_cursor(tcur,
310 XFS_BTREE_NOERROR);
311 if (level > 0 &&
312 (error = xfs_inobt_decrement(cur, level,
313 &i)))
314 return error;
315 *stat = 1;
316 return 0;
317 }
318 }
319 /*
320 * Otherwise, grab the number of records in right for
321 * future reference, and fix up the temp cursor to point
322 * to our block again (last record).
323 */
324 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
325 if (lbno != NULLAGBLOCK) {
326 xfs_btree_firstrec(tcur, level);
327 if ((error = xfs_inobt_decrement(tcur, level, &i)))
328 goto error0;
329 }
330 }
331 /*
332 * If there's a left sibling, see if it's ok to shift an entry
333 * out of it.
334 */
335 if (lbno != NULLAGBLOCK) {
336 /*
337 * Move the temp cursor to the first entry in the
338 * previous block.
339 */
340 xfs_btree_firstrec(tcur, level);
341 if ((error = xfs_inobt_decrement(tcur, level, &i)))
342 goto error0;
343 xfs_btree_firstrec(tcur, level);
344 /*
345 * Grab a pointer to the block.
346 */
347 lbp = tcur->bc_bufs[level];
348 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
349#ifdef DEBUG
350 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
351 goto error0;
352#endif
353 /*
354 * Grab the current block number, for future use.
355 */
356 bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
357 /*
358 * If left block is full enough so that removing one entry
359 * won't make it too empty, and right-shifting an entry out
360 * of left to us works, we're done.
361 */
362 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
363 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
364 if ((error = xfs_inobt_rshift(tcur, level, &i)))
365 goto error0;
366 if (i) {
367 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
368 XFS_INOBT_BLOCK_MINRECS(level, cur));
369 xfs_btree_del_cursor(tcur,
370 XFS_BTREE_NOERROR);
371 if (level == 0)
372 cur->bc_ptrs[0]++;
373 *stat = 1;
374 return 0;
375 }
376 }
377 /*
 378	 * Otherwise, grab the number of records in left for
379 * future reference.
380 */
381 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
382 }
383 /*
384 * Delete the temp cursor, we're done with it.
385 */
386 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
387 /*
388 * If here, we need to do a join to keep the tree balanced.
389 */
390 ASSERT(bno != NULLAGBLOCK);
391 /*
392 * See if we can join with the left neighbor block.
393 */
394 if (lbno != NULLAGBLOCK &&
395 lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
396 /*
397 * Set "right" to be the starting block,
398 * "left" to be the left neighbor.
399 */
400 rbno = bno;
401 right = block;
402 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
403 rbp = bp;
404 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
405 cur->bc_private.i.agno, lbno, 0, &lbp,
406 XFS_INO_BTREE_REF)))
407 return error;
408 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
409 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
410 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
411 return error;
412 }
413 /*
414 * If that won't work, see if we can join with the right neighbor block.
415 */
416 else if (rbno != NULLAGBLOCK &&
417 rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
418 /*
419 * Set "left" to be the starting block,
420 * "right" to be the right neighbor.
421 */
422 lbno = bno;
423 left = block;
424 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
425 lbp = bp;
426 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
427 cur->bc_private.i.agno, rbno, 0, &rbp,
428 XFS_INO_BTREE_REF)))
429 return error;
430 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
431 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
432 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
433 return error;
434 }
435 /*
436 * Otherwise, we can't fix the imbalance.
437 * Just return. This is probably a logic error, but it's not fatal.
438 */
439 else {
440 if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
441 return error;
442 *stat = 1;
443 return 0;
444 }
445 /*
446 * We're now going to join "left" and "right" by moving all the stuff
447 * in "right" to "left" and deleting "right".
448 */
449 if (level > 0) {
450 /*
451 * It's a non-leaf. Move keys and pointers.
452 */
453 lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
454 lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
455 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
456 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
457#ifdef DEBUG
458 for (i = 0; i < rrecs; i++) {
459 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
460 return error;
461 }
462#endif
463 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
464 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
465 xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
466 xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
467 } else {
468 /*
469 * It's a leaf. Move records.
470 */
471 lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
472 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
473 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
474 xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
475 }
476 /*
477 * If we joined with the left neighbor, set the buffer in the
478 * cursor to the left block, and fix up the index.
479 */
480 if (bp != lbp) {
481 xfs_btree_setbuf(cur, level, lbp);
482 cur->bc_ptrs[level] += lrecs;
483 }
484 /*
485 * If we joined with the right neighbor and there's a level above
486 * us, increment the cursor at that level.
487 */
488 else if (level + 1 < cur->bc_nlevels &&
 489		 (error = xfs_inobt_increment(cur, level + 1, &i)))
490 return error;
491 /*
492 * Fix up the number of records in the surviving block.
493 */
494 lrecs += rrecs;
495 INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs);
496 /*
497 * Fix up the right block pointer in the surviving block, and log it.
498 */
499 left->bb_rightsib = right->bb_rightsib;
500 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
501 /*
502 * If there is a right sibling now, make it point to the
503 * remaining block.
504 */
505 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
506 xfs_inobt_block_t *rrblock;
507 xfs_buf_t *rrbp;
508
509 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
510 cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
511 &rrbp, XFS_INO_BTREE_REF)))
512 return error;
513 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
514 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
515 return error;
516 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
517 xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
518 }
519 /*
520 * Free the deleting block.
521 */
522 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
523 cur->bc_private.i.agno, rbno), 1)))
524 return error;
525 xfs_trans_binval(cur->bc_tp, rbp);
526 /*
527 * Readjust the ptr at this level if it's not a leaf, since it's
528 * still pointing at the deletion point, which makes the cursor
529 * inconsistent. If this makes the ptr 0, the caller fixes it up.
530 * We can't use decrement because it would change the next level up.
531 */
532 if (level > 0)
533 cur->bc_ptrs[level]--;
534 /*
535 * Return value means the next level up has something to do.
536 */
537 *stat = 2;
538 return 0;
539
540error0:
541 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
542 return error;
543}
544
545/*
546 * Insert one record/level. Return information to the caller
547 * allowing the next level up to proceed if necessary.
548 */
549STATIC int /* error */
550xfs_inobt_insrec(
551 xfs_btree_cur_t *cur, /* btree cursor */
552 int level, /* level to insert record at */
553 xfs_agblock_t *bnop, /* i/o: block number inserted */
554 xfs_inobt_rec_t *recp, /* i/o: record data inserted */
555 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
556 int *stat) /* success/failure */
557{
558 xfs_inobt_block_t *block; /* btree block record/key lives in */
559 xfs_buf_t *bp; /* buffer for block */
560 int error; /* error return value */
561 int i; /* loop index */
562 xfs_inobt_key_t key; /* key value being inserted */
563 xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
564 xfs_agblock_t nbno; /* block number of allocated block */
565 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
566 xfs_inobt_key_t nkey; /* new key value, from split */
567 xfs_inobt_rec_t nrec; /* new record value, for caller */
568 int numrecs;
569 int optr; /* old ptr value */
570 xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
571 int ptr; /* index in btree block for this rec */
572 xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
573
574 /*
575 * If we made it to the root level, allocate a new root block
576 * and we're done.
577 */
578 if (level >= cur->bc_nlevels) {
579 error = xfs_inobt_newroot(cur, &i);
580 *bnop = NULLAGBLOCK;
581 *stat = i;
582 return error;
583 }
584 /*
585 * Make a key out of the record data to be inserted, and save it.
586 */
587 key.ir_startino = recp->ir_startino; /* INT_: direct copy */
588 optr = ptr = cur->bc_ptrs[level];
589 /*
590 * If we're off the left edge, return failure.
591 */
592 if (ptr == 0) {
593 *stat = 0;
594 return 0;
595 }
596 /*
597 * Get pointers to the btree buffer and block.
598 */
599 bp = cur->bc_bufs[level];
600 block = XFS_BUF_TO_INOBT_BLOCK(bp);
601 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
602#ifdef DEBUG
603 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
604 return error;
605 /*
606 * Check that the new entry is being inserted in the right place.
607 */
608 if (ptr <= numrecs) {
609 if (level == 0) {
610 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
611 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
612 } else {
613 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
614 xfs_btree_check_key(cur->bc_btnum, &key, kp);
615 }
616 }
617#endif
618 nbno = NULLAGBLOCK;
619 ncur = (xfs_btree_cur_t *)0;
620 /*
621 * If the block is full, we can't insert the new entry until we
622 * make the block un-full.
623 */
624 if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
625 /*
626 * First, try shifting an entry to the right neighbor.
627 */
628 if ((error = xfs_inobt_rshift(cur, level, &i)))
629 return error;
630 if (i) {
631 /* nothing */
632 }
633 /*
634 * Next, try shifting an entry to the left neighbor.
635 */
636 else {
637 if ((error = xfs_inobt_lshift(cur, level, &i)))
638 return error;
639 if (i) {
640 optr = ptr = cur->bc_ptrs[level];
641 } else {
642 /*
643 * Next, try splitting the current block
644 * in half. If this works we have to
645 * re-set our variables because
646 * we could be in a different block now.
647 */
648 if ((error = xfs_inobt_split(cur, level, &nbno,
649 &nkey, &ncur, &i)))
650 return error;
651 if (i) {
652 bp = cur->bc_bufs[level];
653 block = XFS_BUF_TO_INOBT_BLOCK(bp);
654#ifdef DEBUG
655 if ((error = xfs_btree_check_sblock(cur,
656 block, level, bp)))
657 return error;
658#endif
659 ptr = cur->bc_ptrs[level];
660 nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */
661 } else {
662 /*
663 * Otherwise the insert fails.
664 */
665 *stat = 0;
666 return 0;
667 }
668 }
669 }
670 }
671 /*
672 * At this point we know there's room for our new entry in the block
673 * we're pointing at.
674 */
675 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
676 if (level > 0) {
677 /*
678 * It's a non-leaf entry. Make a hole for the new data
679 * in the key and ptr regions of the block.
680 */
681 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
682 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
683#ifdef DEBUG
684 for (i = numrecs; i >= ptr; i--) {
685 if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
686 return error;
687 }
688#endif
689 memmove(&kp[ptr], &kp[ptr - 1],
690 (numrecs - ptr + 1) * sizeof(*kp));
691 memmove(&pp[ptr], &pp[ptr - 1],
692 (numrecs - ptr + 1) * sizeof(*pp));
693 /*
694 * Now stuff the new data in, bump numrecs and log the new data.
695 */
696#ifdef DEBUG
697 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
698 return error;
699#endif
700 kp[ptr - 1] = key; /* INT_: struct copy */
701 INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
702 numrecs++;
703 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
704 xfs_inobt_log_keys(cur, bp, ptr, numrecs);
705 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
706 } else {
707 /*
708 * It's a leaf entry. Make a hole for the new record.
709 */
710 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
711 memmove(&rp[ptr], &rp[ptr - 1],
712 (numrecs - ptr + 1) * sizeof(*rp));
713 /*
714 * Now stuff the new record in, bump numrecs
715 * and log the new data.
716 */
717 rp[ptr - 1] = *recp; /* INT_: struct copy */
718 numrecs++;
719 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
720 xfs_inobt_log_recs(cur, bp, ptr, numrecs);
721 }
722 /*
723 * Log the new number of records in the btree header.
724 */
725 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
726#ifdef DEBUG
727 /*
728 * Check that the key/record is in the right place, now.
729 */
730 if (ptr < numrecs) {
731 if (level == 0)
732 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
733 rp + ptr);
734 else
735 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
736 kp + ptr);
737 }
738#endif
739 /*
740 * If we inserted at the start of a block, update the parents' keys.
741 */
742 if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
743 return error;
744 /*
745 * Return the new block number, if any.
746 * If there is one, give back a record value and a cursor too.
747 */
748 *bnop = nbno;
749 if (nbno != NULLAGBLOCK) {
750 *recp = nrec; /* INT_: struct copy */
751 *curp = ncur;
752 }
753 *stat = 1;
754 return 0;
755}
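/*
 * A standalone demonstration (not in the original file) of the memmove
 * "make a hole" idiom used by xfs_inobt_insrec() above: to insert at
 * 1-based index ptr among numrecs records, slide records ptr..numrecs up
 * by one and write the new record at ptr.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int rp[8] = { 10, 20, 40, 50 };	/* room for one more record */
	int numrecs = 4;
	int ptr = 3;			/* insert 30 as the 3rd record */
	int i;

	memmove(&rp[ptr], &rp[ptr - 1], (numrecs - ptr + 1) * sizeof(*rp));
	rp[ptr - 1] = 30;
	numrecs++;
	for (i = 0; i < numrecs; i++)
		printf("%d ", rp[i]);	/* 10 20 30 40 50 */
	printf("\n");
	return 0;
}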
756
757/*
758 * Log header fields from a btree block.
759 */
760STATIC void
761xfs_inobt_log_block(
762 xfs_trans_t *tp, /* transaction pointer */
763 xfs_buf_t *bp, /* buffer containing btree block */
764 int fields) /* mask of fields: XFS_BB_... */
765{
766 int first; /* first byte offset logged */
767 int last; /* last byte offset logged */
768 static const short offsets[] = { /* table of offsets */
769 offsetof(xfs_inobt_block_t, bb_magic),
770 offsetof(xfs_inobt_block_t, bb_level),
771 offsetof(xfs_inobt_block_t, bb_numrecs),
772 offsetof(xfs_inobt_block_t, bb_leftsib),
773 offsetof(xfs_inobt_block_t, bb_rightsib),
774 sizeof(xfs_inobt_block_t)
775 };
776
777 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
778 xfs_trans_log_buf(tp, bp, first, last);
779}
780
781/*
782 * Log keys from a btree block (nonleaf).
783 */
784STATIC void
785xfs_inobt_log_keys(
786 xfs_btree_cur_t *cur, /* btree cursor */
787 xfs_buf_t *bp, /* buffer containing btree block */
788 int kfirst, /* index of first key to log */
789 int klast) /* index of last key to log */
790{
791 xfs_inobt_block_t *block; /* btree block to log from */
792 int first; /* first byte offset logged */
793 xfs_inobt_key_t *kp; /* key pointer in btree block */
794 int last; /* last byte offset logged */
795
796 block = XFS_BUF_TO_INOBT_BLOCK(bp);
797 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
798 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
799 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
800 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
801}
802
803/*
804 * Log block pointer fields from a btree block (nonleaf).
805 */
806STATIC void
807xfs_inobt_log_ptrs(
808 xfs_btree_cur_t *cur, /* btree cursor */
809 xfs_buf_t *bp, /* buffer containing btree block */
810 int pfirst, /* index of first pointer to log */
811 int plast) /* index of last pointer to log */
812{
813 xfs_inobt_block_t *block; /* btree block to log from */
814 int first; /* first byte offset logged */
815 int last; /* last byte offset logged */
816 xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
817
818 block = XFS_BUF_TO_INOBT_BLOCK(bp);
819 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
820 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
821 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
822 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
823}
824
825/*
826 * Log records from a btree block (leaf).
827 */
828STATIC void
829xfs_inobt_log_recs(
830 xfs_btree_cur_t *cur, /* btree cursor */
831 xfs_buf_t *bp, /* buffer containing btree block */
832 int rfirst, /* index of first record to log */
833 int rlast) /* index of last record to log */
834{
835 xfs_inobt_block_t *block; /* btree block to log from */
836 int first; /* first byte offset logged */
837 int last; /* last byte offset logged */
838 xfs_inobt_rec_t *rp; /* record pointer for btree block */
839
840 block = XFS_BUF_TO_INOBT_BLOCK(bp);
841 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
842 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
843 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
844 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
845}
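/*
 * A self-contained sketch (not in the original file) of the byte-range
 * arithmetic shared by the three log helpers above: for a 1-based slice
 * first..last of fixed-size entries at a known offset inside a block,
 * compute the first and last byte offsets to log.  All sizes here are
 * hypothetical.
 */
#include <stdio.h>

int
main(void)
{
	int entry_size = 16;	/* hypothetical sizeof(xfs_inobt_rec_t) */
	int base = 24;		/* hypothetical offset of entry 1 in block */
	int rfirst = 2, rlast = 4;

	int first = base + (rfirst - 1) * entry_size;
	int last = base + rlast * entry_size - 1;

	printf("log bytes %d..%d\n", first, last);	/* 40..87 */
	return 0;
}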
846
847/*
848 * Lookup the record. The cursor is made to point to it, based on dir.
 849 * Set *stat to 0 if no such record can be found, 1 for success.
850 */
851STATIC int /* error */
852xfs_inobt_lookup(
853 xfs_btree_cur_t *cur, /* btree cursor */
854 xfs_lookup_t dir, /* <=, ==, or >= */
855 int *stat) /* success/failure */
856{
857 xfs_agblock_t agbno; /* a.g. relative btree block number */
858 xfs_agnumber_t agno; /* allocation group number */
859 xfs_inobt_block_t *block=NULL; /* current btree block */
860 __int64_t diff; /* difference for the current key */
861 int error; /* error return value */
862 int keyno=0; /* current key number */
863 int level; /* level in the btree */
864 xfs_mount_t *mp; /* file system mount point */
865
866 /*
867 * Get the allocation group header, and the root block number.
868 */
869 mp = cur->bc_mp;
870 {
871 xfs_agi_t *agi; /* a.g. inode header */
872
873 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
874 agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
875 agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
876 }
877 /*
878 * Iterate over each level in the btree, starting at the root.
879 * For each level above the leaves, find the key we need, based
880 * on the lookup record, then follow the corresponding block
881 * pointer down to the next level.
882 */
883 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
884 xfs_buf_t *bp; /* buffer pointer for btree block */
885 xfs_daddr_t d; /* disk address of btree block */
886
887 /*
888 * Get the disk address we're looking for.
889 */
890 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
891 /*
892 * If the old buffer at this level is for a different block,
893 * throw it away, otherwise just use it.
894 */
895 bp = cur->bc_bufs[level];
896 if (bp && XFS_BUF_ADDR(bp) != d)
897 bp = (xfs_buf_t *)0;
898 if (!bp) {
899 /*
900 * Need to get a new buffer. Read it, then
901 * set it in the cursor, releasing the old one.
902 */
903 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
904 agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
905 return error;
906 xfs_btree_setbuf(cur, level, bp);
907 /*
908 * Point to the btree block, now that we have the buffer
909 */
910 block = XFS_BUF_TO_INOBT_BLOCK(bp);
911 if ((error = xfs_btree_check_sblock(cur, block, level,
912 bp)))
913 return error;
914 } else
915 block = XFS_BUF_TO_INOBT_BLOCK(bp);
916 /*
917 * If we already had a key match at a higher level, we know
918 * we need to use the first entry in this block.
919 */
920 if (diff == 0)
921 keyno = 1;
922 /*
923 * Otherwise we need to search this block. Do a binary search.
924 */
925 else {
926 int high; /* high entry number */
927 xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
928 xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
929 int low; /* low entry number */
930
931 /*
932 * Get a pointer to keys or records.
933 */
934 if (level > 0)
935 kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
936 else
937 krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
938 /*
939 * Set low and high entry numbers, 1-based.
940 */
941 low = 1;
942 if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
943 /*
944 * If the block is empty, the tree must
945 * be an empty leaf.
946 */
947 ASSERT(level == 0 && cur->bc_nlevels == 1);
948 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
949 *stat = 0;
950 return 0;
951 }
952 /*
953 * Binary search the block.
954 */
955 while (low <= high) {
956 xfs_agino_t startino; /* key value */
957
958 /*
959 * keyno is average of low and high.
960 */
961 keyno = (low + high) >> 1;
962 /*
963 * Get startino.
964 */
965 if (level > 0) {
966 xfs_inobt_key_t *kkp;
967
968 kkp = kkbase + keyno - 1;
969 startino = INT_GET(kkp->ir_startino, ARCH_CONVERT);
970 } else {
971 xfs_inobt_rec_t *krp;
972
973 krp = krbase + keyno - 1;
974 startino = INT_GET(krp->ir_startino, ARCH_CONVERT);
975 }
976 /*
977 * Compute difference to get next direction.
978 */
979 diff = (__int64_t)
980 startino - cur->bc_rec.i.ir_startino;
981 /*
982 * Less than, move right.
983 */
984 if (diff < 0)
985 low = keyno + 1;
986 /*
987 * Greater than, move left.
988 */
989 else if (diff > 0)
990 high = keyno - 1;
991 /*
992 * Equal, we're done.
993 */
994 else
995 break;
996 }
997 }
998 /*
999 * If there are more levels, set up for the next level
1000 * by getting the block number and filling in the cursor.
1001 */
1002 if (level > 0) {
1003 /*
1004 * If we moved left, need the previous key number,
1005 * unless there isn't one.
1006 */
1007 if (diff > 0 && --keyno < 1)
1008 keyno = 1;
1009 agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
1010#ifdef DEBUG
1011 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1012 return error;
1013#endif
1014 cur->bc_ptrs[level] = keyno;
1015 }
1016 }
1017 /*
1018 * Done with the search.
1019 * See if we need to adjust the results.
1020 */
1021 if (dir != XFS_LOOKUP_LE && diff < 0) {
1022 keyno++;
1023 /*
1024 * If ge search and we went off the end of the block, but it's
1025 * not the last block, we're in the wrong block.
1026 */
1027 if (dir == XFS_LOOKUP_GE &&
1028 keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
1029 INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1030 int i;
1031
1032 cur->bc_ptrs[0] = keyno;
1033 if ((error = xfs_inobt_increment(cur, 0, &i)))
1034 return error;
1035 ASSERT(i == 1);
1036 *stat = 1;
1037 return 0;
1038 }
1039 }
1040 else if (dir == XFS_LOOKUP_LE && diff > 0)
1041 keyno--;
1042 cur->bc_ptrs[0] = keyno;
1043 /*
1044 * Return if we succeeded or not.
1045 */
1046 if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
1047 *stat = 0;
1048 else
1049 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1050 return 0;
1051}
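/*
 * A standalone sketch (not in the original file) of the 1-based binary
 * search in xfs_inobt_lookup() above: find the keyno whose startino is
 * <= the search value, in the style of an XFS_LOOKUP_LE lookup.  The
 * key values are made up.
 */
#include <stdio.h>

int
main(void)
{
	int startino[] = { 0, 64, 128, 256, 512 };	/* keys 1..5 */
	int nrecs = 5;
	int want = 200;
	int low = 1, high = nrecs, keyno = 0;
	long diff = 1;

	while (low <= high) {
		keyno = (low + high) >> 1;
		diff = (long)startino[keyno - 1] - want;
		if (diff < 0)
			low = keyno + 1;	/* key too small, go right */
		else if (diff > 0)
			high = keyno - 1;	/* key too big, go left */
		else
			break;
	}
	if (diff > 0)		/* LE lookup: step back one entry */
		keyno--;
	printf("keyno = %d (startino %d)\n", keyno, startino[keyno - 1]);
	return 0;
}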
1052
1053/*
1054 * Move 1 record left from cur/level if possible.
1055 * Update cur to reflect the new path.
1056 */
1057STATIC int /* error */
1058xfs_inobt_lshift(
1059 xfs_btree_cur_t *cur, /* btree cursor */
1060 int level, /* level to shift record on */
1061 int *stat) /* success/failure */
1062{
1063 int error; /* error return value */
1064#ifdef DEBUG
1065 int i; /* loop index */
1066#endif
1067 xfs_inobt_key_t key; /* key value for leaf level upward */
1068 xfs_buf_t *lbp; /* buffer for left neighbor block */
1069 xfs_inobt_block_t *left; /* left neighbor btree block */
1070 xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
1071 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1072 xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
1073 int nrec; /* new number of left block entries */
1074 xfs_buf_t *rbp; /* buffer for right (current) block */
1075 xfs_inobt_block_t *right; /* right (current) btree block */
1076 xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
1077 xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
1078 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1079
1080 /*
1081 * Set up variables for this block as "right".
1082 */
1083 rbp = cur->bc_bufs[level];
1084 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1085#ifdef DEBUG
1086 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1087 return error;
1088#endif
1089 /*
1090 * If we've got no left sibling then we can't shift an entry left.
1091 */
1092 if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1093 *stat = 0;
1094 return 0;
1095 }
1096 /*
1097 * If the cursor entry is the one that would be moved, don't
1098 * do it... it's too complicated.
1099 */
1100 if (cur->bc_ptrs[level] <= 1) {
1101 *stat = 0;
1102 return 0;
1103 }
1104 /*
1105 * Set up the left neighbor as "left".
1106 */
1107 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1108 cur->bc_private.i.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
1109 XFS_INO_BTREE_REF)))
1110 return error;
1111 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1112 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1113 return error;
1114 /*
1115 * If it's full, it can't take another entry.
1116 */
1117 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1118 *stat = 0;
1119 return 0;
1120 }
1121 nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
1122 /*
1123 * If non-leaf, copy a key and a ptr to the left block.
1124 */
1125 if (level > 0) {
1126 lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
1127 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1128 *lkp = *rkp;
1129 xfs_inobt_log_keys(cur, lbp, nrec, nrec);
1130 lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
1131 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1132#ifdef DEBUG
1133 if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
1134 return error;
1135#endif
1136 *lpp = *rpp; /* INT_: no-change copy */
1137 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
1138 }
1139 /*
1140 * If leaf, copy a record to the left block.
1141 */
1142 else {
1143 lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
1144 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1145 *lrp = *rrp;
1146 xfs_inobt_log_recs(cur, lbp, nrec, nrec);
1147 }
1148 /*
1149 * Bump and log left's numrecs, decrement and log right's numrecs.
1150 */
1151 INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
1152 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1153#ifdef DEBUG
1154 if (level > 0)
1155 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1156 else
1157 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1158#endif
1159 INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
1160 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1161 /*
1162 * Slide the contents of right down one entry.
1163 */
1164 if (level > 0) {
1165#ifdef DEBUG
1166 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1167 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
1168 level)))
1169 return error;
1170 }
1171#endif
1172 memmove(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1173 memmove(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1174 xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1175 xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1176 } else {
1177 memmove(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1178 xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1179 key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
1180 rkp = &key;
1181 }
1182 /*
1183 * Update the parent key values of right.
1184 */
1185 if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
1186 return error;
1187 /*
1188 * Slide the cursor value left one.
1189 */
1190 cur->bc_ptrs[level]--;
1191 *stat = 1;
1192 return 0;
1193}
1194
1195/*
1196 * Allocate a new root block, fill it in.
1197 */
1198STATIC int /* error */
1199xfs_inobt_newroot(
1200 xfs_btree_cur_t *cur, /* btree cursor */
1201 int *stat) /* success/failure */
1202{
1203 xfs_agi_t *agi; /* a.g. inode header */
1204 xfs_alloc_arg_t args; /* allocation argument structure */
1205 xfs_inobt_block_t *block; /* one half of the old root block */
1206 xfs_buf_t *bp; /* buffer containing block */
1207 int error; /* error return value */
1208 xfs_inobt_key_t *kp; /* btree key pointer */
1209 xfs_agblock_t lbno; /* left block number */
1210 xfs_buf_t *lbp; /* left buffer pointer */
1211 xfs_inobt_block_t *left; /* left btree block */
1212 xfs_buf_t *nbp; /* new (root) buffer */
1213 xfs_inobt_block_t *new; /* new (root) btree block */
1214 int nptr; /* new value for key index, 1 or 2 */
1215 xfs_inobt_ptr_t *pp; /* btree address pointer */
1216 xfs_agblock_t rbno; /* right block number */
1217 xfs_buf_t *rbp; /* right buffer pointer */
1218 xfs_inobt_block_t *right; /* right btree block */
1219 xfs_inobt_rec_t *rp; /* btree record pointer */
1220
1221 ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
1222
1223 /*
1224 * Get a block & a buffer.
1225 */
1226 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
1227 args.tp = cur->bc_tp;
1228 args.mp = cur->bc_mp;
1229 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
1230 INT_GET(agi->agi_root, ARCH_CONVERT));
1231 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1232 args.isfl = args.userdata = args.minalignslop = 0;
1233 args.minlen = args.maxlen = args.prod = 1;
1234 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1235 if ((error = xfs_alloc_vextent(&args)))
1236 return error;
1237 /*
1238 * None available, we fail.
1239 */
1240 if (args.fsbno == NULLFSBLOCK) {
1241 *stat = 0;
1242 return 0;
1243 }
1244 ASSERT(args.len == 1);
1245 nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1246 new = XFS_BUF_TO_INOBT_BLOCK(nbp);
1247 /*
1248 * Set the root data in the a.g. inode structure.
1249 */
1250 INT_SET(agi->agi_root, ARCH_CONVERT, args.agbno);
1251 INT_MOD(agi->agi_level, ARCH_CONVERT, 1);
1252 xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
1253 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1254 /*
1255 * At the previous root level there are now two blocks: the old
1256 * root, and the new block generated when it was split.
1257 * We don't know which one the cursor is pointing at, so we
1258 * set up variables "left" and "right" for each case.
1259 */
1260 bp = cur->bc_bufs[cur->bc_nlevels - 1];
1261 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1262#ifdef DEBUG
1263 if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
1264 return error;
1265#endif
1266 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1267 /*
1268 * Our block is left, pick up the right block.
1269 */
1270 lbp = bp;
1271 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1272 left = block;
1273 rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
1274 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1275 rbno, 0, &rbp, XFS_INO_BTREE_REF)))
1276 return error;
1277 bp = rbp;
1278 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1279 if ((error = xfs_btree_check_sblock(cur, right,
1280 cur->bc_nlevels - 1, rbp)))
1281 return error;
1282 nptr = 1;
1283 } else {
1284 /*
1285 * Our block is right, pick up the left block.
1286 */
1287 rbp = bp;
1288 rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
1289 right = block;
1290 lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
1291 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1292 lbno, 0, &lbp, XFS_INO_BTREE_REF)))
1293 return error;
1294 bp = lbp;
1295 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1296 if ((error = xfs_btree_check_sblock(cur, left,
1297 cur->bc_nlevels - 1, lbp)))
1298 return error;
1299 nptr = 2;
1300 }
1301 /*
1302 * Fill in the new block's btree header and log it.
1303 */
1304 INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1305 INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
1306 INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
1307 INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
1308 INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
1309 xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
1310 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1311 /*
1312 * Fill in the key data in the new root.
1313 */
1314 kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
1315 if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
1316 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */
1317 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */
1318 } else {
1319 rp = XFS_INOBT_REC_ADDR(left, 1, cur);
1320 INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT);
1321 rp = XFS_INOBT_REC_ADDR(right, 1, cur);
1322 INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT);
1323 }
1324 xfs_inobt_log_keys(cur, nbp, 1, 2);
1325 /*
1326 * Fill in the pointer data in the new root.
1327 */
1328 pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
1329 INT_SET(pp[0], ARCH_CONVERT, lbno);
1330 INT_SET(pp[1], ARCH_CONVERT, rbno);
1331 xfs_inobt_log_ptrs(cur, nbp, 1, 2);
1332 /*
1333 * Fix up the cursor.
1334 */
1335 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1336 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1337 cur->bc_nlevels++;
1338 *stat = 1;
1339 return 0;
1340}
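
A sketch of the invariant xfs_inobt_newroot() establishes: whichever sibling
the cursor happened to be on, the freshly allocated root always ends up with
exactly two entries, one keyed for each sibling of the old root level. This
is a simplified userspace model with integers standing in for keys and AG
block numbers; the names are hypothetical, not kernel API.

#include <assert.h>

struct root_model {
	int	keys[2];	/* first key of each child block */
	int	ptrs[2];	/* AG block number of each child */
	int	numrecs;
};

static void newroot_model(struct root_model *root,
			  int lkey, int lbno, int rkey, int rbno)
{
	root->keys[0] = lkey;
	root->ptrs[0] = lbno;
	root->keys[1] = rkey;
	root->ptrs[1] = rbno;
	root->numrecs = 2;	/* always exactly two, like bb_numrecs */
}

int main(void)
{
	struct root_model r;

	newroot_model(&r, 16, 7, 80, 9);
	assert(r.numrecs == 2 && r.ptrs[0] == 7 && r.ptrs[1] == 9);
	assert(r.keys[0] < r.keys[1]);	/* keys stay ordered */
	return 0;
}
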
1341
1342/*
1343 * Move 1 record right from cur/level if possible.
1344 * Update cur to reflect the new path.
1345 */
1346STATIC int /* error */
1347xfs_inobt_rshift(
1348 xfs_btree_cur_t *cur, /* btree cursor */
1349 int level, /* level to shift record on */
1350 int *stat) /* success/failure */
1351{
1352 int error; /* error return value */
1353 int i; /* loop index */
1354 xfs_inobt_key_t key; /* key value for leaf level upward */
1355 xfs_buf_t *lbp; /* buffer for left (current) block */
1356 xfs_inobt_block_t *left; /* left (current) btree block */
1357 xfs_inobt_key_t *lkp; /* key pointer for left block */
1358 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1359 xfs_inobt_rec_t *lrp; /* record pointer for left block */
1360 xfs_buf_t *rbp; /* buffer for right neighbor block */
1361 xfs_inobt_block_t *right; /* right neighbor btree block */
1362 xfs_inobt_key_t *rkp; /* key pointer for right block */
1363 xfs_inobt_ptr_t *rpp; /* address pointer for right block */
1364 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1365 xfs_btree_cur_t *tcur; /* temporary cursor */
1366
1367 /*
1368 * Set up variables for this block as "left".
1369 */
1370 lbp = cur->bc_bufs[level];
1371 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1372#ifdef DEBUG
1373 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1374 return error;
1375#endif
1376 /*
1377 * If we've got no right sibling then we can't shift an entry right.
1378 */
1379 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1380 *stat = 0;
1381 return 0;
1382 }
1383 /*
1384 * If the cursor entry is the one that would be moved, don't
1385 * do it... it's too complicated.
1386 */
1387 if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
1388 *stat = 0;
1389 return 0;
1390 }
1391 /*
1392 * Set up the right neighbor as "right".
1393 */
1394 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1395 cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
1396 XFS_INO_BTREE_REF)))
1397 return error;
1398 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1399 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1400 return error;
1401 /*
1402 * If it's full, it can't take another entry.
1403 */
1404 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1405 *stat = 0;
1406 return 0;
1407 }
1408 /*
1409 * Make a hole at the start of the right neighbor block, then
1410 * copy the last left block entry to the hole.
1411 */
1412 if (level > 0) {
1413 lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1414 lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1415 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1416 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1417#ifdef DEBUG
1418 for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
1419 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
1420 return error;
1421 }
1422#endif
1423 memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1424 memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1425#ifdef DEBUG
1426 if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
1427 return error;
1428#endif
1429 *rkp = *lkp; /* INT_: no change copy */
1430 *rpp = *lpp; /* INT_: no change copy */
1431 xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1432 xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1433 } else {
1434 lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1435 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1436 memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1437 *rrp = *lrp;
1438 xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1439 key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
1440 rkp = &key;
1441 }
1442 /*
1443 * Decrement and log left's numrecs, bump and log right's numrecs.
1444 */
1445 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
1446 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1447 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1448#ifdef DEBUG
1449 if (level > 0)
1450 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1451 else
1452 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1453#endif
1454 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1455 /*
1456 * Using a temporary cursor, update the parent key values of the
1457 * block on the right.
1458 */
1459 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1460 return error;
1461 xfs_btree_lastrec(tcur, level);
1462 if ((error = xfs_inobt_increment(tcur, level, &i)) ||
1463 (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
1464 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1465 return error;
1466 }
1467 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1468 *stat = 1;
1469 return 0;
1470}
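
The right shift mirrors the left shift: open a one-entry hole at the front
of the right block, then copy the last left entry into it. The same minimal
model as before, with made-up names and int records standing in for the
on-disk structures (illustrative only):

#include <assert.h>
#include <string.h>

static void rshift_model(int *left, int *nleft, int *right, int *nright)
{
	/* Hole first (memmove handles the overlap), then the copy. */
	memmove(right + 1, right, (size_t)(*nright) * sizeof(*right));
	right[0] = left[--(*nleft)];
	(*nright)++;
}

int main(void)
{
	int left[8] = { 1, 2, 3 }, right[8] = { 10, 20, 30 };
	int nleft = 3, nright = 3;

	rshift_model(left, &nleft, right, &nright);
	assert(nleft == 2);
	assert(nright == 4 && right[0] == 3 && right[1] == 10);
	return 0;
}
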
1471
1472/*
1473 * Split cur/level block in half.
1474 * Return new block number and its first record (to be inserted into parent).
1475 */
1476STATIC int /* error */
1477xfs_inobt_split(
1478 xfs_btree_cur_t *cur, /* btree cursor */
1479 int level, /* level to split */
1480 xfs_agblock_t *bnop, /* output: block number allocated */
1481 xfs_inobt_key_t *keyp, /* output: first key of new block */
1482 xfs_btree_cur_t **curp, /* output: new cursor */
1483 int *stat) /* success/failure */
1484{
1485 xfs_alloc_arg_t args; /* allocation argument structure */
1486 int error; /* error return value */
1487 int i; /* loop index/record number */
1488 xfs_agblock_t lbno; /* left (current) block number */
1489 xfs_buf_t *lbp; /* buffer for left block */
1490 xfs_inobt_block_t *left; /* left (current) btree block */
1491 xfs_inobt_key_t *lkp; /* left btree key pointer */
1492 xfs_inobt_ptr_t *lpp; /* left btree address pointer */
1493 xfs_inobt_rec_t *lrp; /* left btree record pointer */
1494 xfs_buf_t *rbp; /* buffer for right block */
1495 xfs_inobt_block_t *right; /* right (new) btree block */
1496 xfs_inobt_key_t *rkp; /* right btree key pointer */
1497 xfs_inobt_ptr_t *rpp; /* right btree address pointer */
1498 xfs_inobt_rec_t *rrp; /* right btree record pointer */
1499
1500 /*
1501 * Set up left block (current one).
1502 */
1503 lbp = cur->bc_bufs[level];
1504 args.tp = cur->bc_tp;
1505 args.mp = cur->bc_mp;
1506 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1507 /*
1508 * Allocate the new block.
1509 * If we can't do it, we're toast. Give up.
1510 */
1511 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
1512 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1513 args.isfl = args.userdata = args.minalignslop = 0;
1514 args.minlen = args.maxlen = args.prod = 1;
1515 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1516 if ((error = xfs_alloc_vextent(&args)))
1517 return error;
1518 if (args.fsbno == NULLFSBLOCK) {
1519 *stat = 0;
1520 return 0;
1521 }
1522 ASSERT(args.len == 1);
1523 rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1524 /*
1525 * Set up the new block as "right".
1526 */
1527 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1528 /*
1529 * "Left" is the current (according to the cursor) block.
1530 */
1531 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1532#ifdef DEBUG
1533 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1534 return error;
1535#endif
1536 /*
1537 * Fill in the btree header for the new block.
1538 */
1539 INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1540 right->bb_level = left->bb_level; /* INT_: direct copy */
1541 INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
1542 /*
1543	 * Make sure that if there's an odd number of entries now,
1544	 * each new block will have the same number of entries.
1545 */
1546 if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
1547 cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
1548 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1549 i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
1550 /*
1551 * For non-leaf blocks, copy keys and addresses over to the new block.
1552 */
1553 if (level > 0) {
1554 lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
1555 lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
1556 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1557 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1558#ifdef DEBUG
1559 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1560 if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
1561 return error;
1562 }
1563#endif
1564 memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1565 memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1566 xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1567 xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1568 *keyp = *rkp;
1569 }
1570 /*
1571 * For leaf blocks, copy records over to the new block.
1572 */
1573 else {
1574 lrp = XFS_INOBT_REC_ADDR(left, i, cur);
1575 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1576 memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1577 xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1578 keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */
1579 }
1580 /*
1581 * Find the left block number by looking in the buffer.
1582 * Adjust numrecs, sibling pointers.
1583 */
1584 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
1585 right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
1586 INT_SET(left->bb_rightsib, ARCH_CONVERT, args.agbno);
1587 INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
1588 xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
1589 xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1590 /*
1591 * If there's a block to the new block's right, make that block
1592 * point back to right instead of to left.
1593 */
1594 if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1595 xfs_inobt_block_t *rrblock; /* rr btree block */
1596 xfs_buf_t *rrbp; /* buffer for rrblock */
1597
1598 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1599 INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
1600 XFS_INO_BTREE_REF)))
1601 return error;
1602 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
1603 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1604 return error;
1605 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.agbno);
1606 xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
1607 }
1608 /*
1609 * If the cursor is really in the right block, move it there.
1610 * If it's just pointing past the last entry in left, then we'll
1611 * insert there, so don't change anything in that case.
1612 */
1613 if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
1614 xfs_btree_setbuf(cur, level, rbp);
1615 cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
1616 }
1617 /*
1618	 * If there are more levels, we'll need another cursor which refers
1619	 * to the right block, no matter where this cursor was.
1620 */
1621 if (level + 1 < cur->bc_nlevels) {
1622 if ((error = xfs_btree_dup_cursor(cur, curp)))
1623 return error;
1624 (*curp)->bc_ptrs[level + 1]++;
1625 }
1626 *bnop = args.agbno;
1627 *stat = 1;
1628 return 0;
1629}
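
The split point above is worth restating: the new right block takes half of
the left block's records, rounded down, with one extra when the old count is
odd and the pending insert lands in the low half; that way both blocks end
up equally full once the insert completes. A standalone model of just that
arithmetic (cursor is a 1-based record index; a sketch, not kernel code):

#include <assert.h>

static int split_count_model(int nleft, int cursor)
{
	int nright = nleft / 2;

	if ((nleft & 1) && cursor <= nright + 1)
		nright++;
	return nright;
}

int main(void)
{
	assert(split_count_model(7, 2) == 4);	/* insert in low half */
	assert(split_count_model(7, 6) == 3);	/* insert in high half */
	assert(split_count_model(8, 5) == 4);	/* even count: plain half */
	return 0;
}
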
1630
1631/*
1632 * Update keys at all levels from here to the root along the cursor's path.
1633 */
1634STATIC int /* error */
1635xfs_inobt_updkey(
1636 xfs_btree_cur_t *cur, /* btree cursor */
1637 xfs_inobt_key_t *keyp, /* new key value to update to */
1638 int level) /* starting level for update */
1639{
1640 int ptr; /* index of key in block */
1641
1642 /*
1643 * Go up the tree from this level toward the root.
1644 * At each level, update the key value to the value input.
1645 * Stop when we reach a level where the cursor isn't pointing
1646 * at the first entry in the block.
1647 */
1648 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1649 xfs_buf_t *bp; /* buffer for block */
1650 xfs_inobt_block_t *block; /* btree block */
1651#ifdef DEBUG
1652 int error; /* error return value */
1653#endif
1654 xfs_inobt_key_t *kp; /* ptr to btree block keys */
1655
1656 bp = cur->bc_bufs[level];
1657 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1658#ifdef DEBUG
1659 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1660 return error;
1661#endif
1662 ptr = cur->bc_ptrs[level];
1663 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
1664 *kp = *keyp;
1665 xfs_inobt_log_keys(cur, bp, ptr, ptr);
1666 }
1667 return 0;
1668}
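
The updkey loop only climbs while the cursor sits on entry 1: a parent's key
for a child equals the child's first key, so a change is visible upward only
at a block's first entry, and the walk stops after the first block where the
cursor is elsewhere. A toy count of how many levels get their key rewritten
(ptrs[] plays the role of cur->bc_ptrs; hypothetical, not kernel code):

#include <assert.h>

static int updkey_visits_model(const int *ptrs, int nlevels, int level)
{
	int visits = 0;

	for (int ptr = 1; ptr == 1 && level < nlevels; level++) {
		ptr = ptrs[level];	/* key at this slot gets rewritten */
		visits++;
	}
	return visits;
}

int main(void)
{
	int ptrs[] = { 1, 1, 1, 3, 2 };

	/* Starting at level 1: levels 1 and 2 are on entry 1, level 3 is
	 * on entry 3, so keys at levels 1, 2 and 3 are rewritten. */
	assert(updkey_visits_model(ptrs, 5, 1) == 3);
	return 0;
}
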
1669
1670/*
1671 * Externally visible routines.
1672 */
1673
1674/*
1675 * Decrement cursor by one record at the level.
1676 * For nonzero levels the leaf-ward information is untouched.
1677 */
1678int /* error */
1679xfs_inobt_decrement(
1680 xfs_btree_cur_t *cur, /* btree cursor */
1681 int level, /* level in btree, 0 is leaf */
1682 int *stat) /* success/failure */
1683{
1684 xfs_inobt_block_t *block; /* btree block */
1685 int error;
1686 int lev; /* btree level */
1687
1688 ASSERT(level < cur->bc_nlevels);
1689 /*
1690 * Read-ahead to the left at this level.
1691 */
1692 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1693 /*
1694 * Decrement the ptr at this level. If we're still in the block
1695 * then we're done.
1696 */
1697 if (--cur->bc_ptrs[level] > 0) {
1698 *stat = 1;
1699 return 0;
1700 }
1701 /*
1702 * Get a pointer to the btree block.
1703 */
1704 block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
1705#ifdef DEBUG
1706 if ((error = xfs_btree_check_sblock(cur, block, level,
1707 cur->bc_bufs[level])))
1708 return error;
1709#endif
1710 /*
1711 * If we just went off the left edge of the tree, return failure.
1712 */
1713 if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1714 *stat = 0;
1715 return 0;
1716 }
1717 /*
1718 * March up the tree decrementing pointers.
1719 * Stop when we don't go off the left edge of a block.
1720 */
1721 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1722 if (--cur->bc_ptrs[lev] > 0)
1723 break;
1724 /*
1725 * Read-ahead the left block, we're going to read it
1726 * in the next loop.
1727 */
1728 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1729 }
1730 /*
1731 * If we went off the root then we are seriously confused.
1732 */
1733 ASSERT(lev < cur->bc_nlevels);
1734 /*
1735 * Now walk back down the tree, fixing up the cursor's buffer
1736 * pointers and key numbers.
1737 */
1738 for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1739 xfs_agblock_t agbno; /* block number of btree block */
1740 xfs_buf_t *bp; /* buffer containing btree block */
1741
1742 agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1743 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1744 cur->bc_private.i.agno, agbno, 0, &bp,
1745 XFS_INO_BTREE_REF)))
1746 return error;
1747 lev--;
1748 xfs_btree_setbuf(cur, lev, bp);
1749 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1750 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1751 return error;
1752 cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
1753 }
1754 *stat = 1;
1755 return 0;
1756}
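
The up-then-down walk in xfs_inobt_decrement() can be modelled with two
small loops over per-level 1-based positions: climb while a level's pointer
underflows, then descend again pointing each lower level at its new block's
last entry. In the sketch below numrecs[] is fixed up front; in the real
code it is read from each newly loaded left-sibling block.

#include <assert.h>

#define NLEVELS 3

int main(void)
{
	int ptrs[NLEVELS]    = { 1, 1, 2 };	/* leaf first, root last */
	int numrecs[NLEVELS] = { 4, 5, 2 };	/* sizes along the new path */
	int lev;

	/* March up the tree decrementing pointers. */
	for (lev = 0; lev < NLEVELS; lev++)
		if (--ptrs[lev] > 0)
			break;
	assert(lev == 2 && ptrs[2] == 1);

	/* Walk back down, landing on each block's last entry. */
	for (lev--; lev >= 0; lev--)
		ptrs[lev] = numrecs[lev];
	assert(ptrs[0] == 4 && ptrs[1] == 5);
	return 0;
}
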
1757
1758/*
1759 * Delete the record pointed to by cur.
1760 * The cursor refers to the place where the record was (could be inserted)
1761 * when the operation returns.
1762 */
1763int /* error */
1764xfs_inobt_delete(
1765 xfs_btree_cur_t *cur, /* btree cursor */
1766 int *stat) /* success/failure */
1767{
1768 int error;
1769 int i; /* result code */
1770 int level; /* btree level */
1771
1772 /*
1773 * Go up the tree, starting at leaf level.
1774 * If 2 is returned then a join was done; go to the next level.
1775 * Otherwise we are done.
1776 */
1777 for (level = 0, i = 2; i == 2; level++) {
1778 if ((error = xfs_inobt_delrec(cur, level, &i)))
1779 return error;
1780 }
1781 if (i == 0) {
1782 for (level = 1; level < cur->bc_nlevels; level++) {
1783 if (cur->bc_ptrs[level] == 0) {
1784 if ((error = xfs_inobt_decrement(cur, level, &i)))
1785 return error;
1786 break;
1787 }
1788 }
1789 }
1790 *stat = i;
1791 return 0;
1792}
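
xfs_inobt_delrec()'s tri-state result drives this loop: 2 means two blocks
were joined and the delete must be repeated one level up, 1 means the level
absorbed the delete, 0 means nothing was deleted. A toy trace of that
control flow with made-up per-level results (a sketch, not kernel code):

#include <assert.h>

int main(void)
{
	int results[] = { 2, 2, 1 };	/* joins at levels 0 and 1 */
	int level, i;

	for (level = 0, i = 2; i == 2; level++)
		i = results[level];
	assert(level == 3 && i == 1);	/* stopped above the last join */
	return 0;
}
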
1793
1794
1795/*
1796 * Get the data from the pointed-to record.
1797 */
1798int /* error */
1799xfs_inobt_get_rec(
1800 xfs_btree_cur_t *cur, /* btree cursor */
1801 xfs_agino_t *ino, /* output: starting inode of chunk */
1802 __int32_t *fcnt, /* output: number of free inodes */
1803 xfs_inofree_t *free, /* output: free inode mask */
1804 int *stat) /* output: success/failure */
1805{
1806 xfs_inobt_block_t *block; /* btree block */
1807 xfs_buf_t *bp; /* buffer containing btree block */
1808#ifdef DEBUG
1809 int error; /* error return value */
1810#endif
1811 int ptr; /* record number */
1812 xfs_inobt_rec_t *rec; /* record data */
1813
1814 bp = cur->bc_bufs[0];
1815 ptr = cur->bc_ptrs[0];
1816 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1817#ifdef DEBUG
1818 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
1819 return error;
1820#endif
1821 /*
1822 * Off the right end or left end, return failure.
1823 */
1824 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
1825 *stat = 0;
1826 return 0;
1827 }
1828 /*
1829 * Point to the record and extract its data.
1830 */
1831 rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
1832 *ino = INT_GET(rec->ir_startino, ARCH_CONVERT);
1833 *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT);
1834 *free = INT_GET(rec->ir_free, ARCH_CONVERT);
1835 *stat = 1;
1836 return 0;
1837}
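
A record's free count and free mask are redundant on purpose: ir_freecount
must always equal the number of set bits in ir_free, which is what lets
callers trust either field. A standalone check of that invariant with
hypothetical values (not kernel code):

#include <assert.h>
#include <stdint.h>

static int popcount64(uint64_t x)
{
	int n = 0;

	for (; x != 0; x &= x - 1)	/* clear the lowest set bit */
		n++;
	return n;
}

int main(void)
{
	uint64_t ir_free      = 0xf0f0f0f0f0f0f0f0ULL;
	int32_t  ir_freecount = 32;

	assert(popcount64(ir_free) == ir_freecount);
	return 0;
}
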
1838
1839/*
1840 * Increment cursor by one record at the level.
1841 * For nonzero levels the leaf-ward information is untouched.
1842 */
1843int /* error */
1844xfs_inobt_increment(
1845 xfs_btree_cur_t *cur, /* btree cursor */
1846 int level, /* level in btree, 0 is leaf */
1847 int *stat) /* success/failure */
1848{
1849 xfs_inobt_block_t *block; /* btree block */
1850 xfs_buf_t *bp; /* buffer containing btree block */
1851 int error; /* error return value */
1852 int lev; /* btree level */
1853
1854 ASSERT(level < cur->bc_nlevels);
1855 /*
1856 * Read-ahead to the right at this level.
1857 */
1858 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1859 /*
1860 * Get a pointer to the btree block.
1861 */
1862 bp = cur->bc_bufs[level];
1863 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1864#ifdef DEBUG
1865 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1866 return error;
1867#endif
1868 /*
1869 * Increment the ptr at this level. If we're still in the block
1870 * then we're done.
1871 */
1872 if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
1873 *stat = 1;
1874 return 0;
1875 }
1876 /*
1877 * If we just went off the right edge of the tree, return failure.
1878 */
1879 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1880 *stat = 0;
1881 return 0;
1882 }
1883 /*
1884 * March up the tree incrementing pointers.
1885 * Stop when we don't go off the right edge of a block.
1886 */
1887 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1888 bp = cur->bc_bufs[lev];
1889 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1890#ifdef DEBUG
1891 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1892 return error;
1893#endif
1894 if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
1895 break;
1896 /*
1897 * Read-ahead the right block, we're going to read it
1898 * in the next loop.
1899 */
1900 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1901 }
1902 /*
1903 * If we went off the root then we are seriously confused.
1904 */
1905 ASSERT(lev < cur->bc_nlevels);
1906 /*
1907 * Now walk back down the tree, fixing up the cursor's buffer
1908 * pointers and key numbers.
1909 */
1910 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
1911 lev > level; ) {
1912 xfs_agblock_t agbno; /* block number of btree block */
1913
1914 agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1915 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1916 cur->bc_private.i.agno, agbno, 0, &bp,
1917 XFS_INO_BTREE_REF)))
1918 return error;
1919 lev--;
1920 xfs_btree_setbuf(cur, lev, bp);
1921 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1922 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1923 return error;
1924 cur->bc_ptrs[lev] = 1;
1925 }
1926 *stat = 1;
1927 return 0;
1928}
1929
1930/*
1931 * Insert the current record at the point referenced by cur.
1932 * The cursor may be inconsistent on return if splits have been done.
1933 */
1934int /* error */
1935xfs_inobt_insert(
1936 xfs_btree_cur_t *cur, /* btree cursor */
1937 int *stat) /* success/failure */
1938{
1939 int error; /* error return value */
1940 int i; /* result value, 0 for failure */
1941 int level; /* current level number in btree */
1942 xfs_agblock_t nbno; /* new block number (split result) */
1943 xfs_btree_cur_t *ncur; /* new cursor (split result) */
1944 xfs_inobt_rec_t nrec; /* record being inserted this level */
1945 xfs_btree_cur_t *pcur; /* previous level's cursor */
1946
1947 level = 0;
1948 nbno = NULLAGBLOCK;
1949 INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino);
1950 INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount);
1951 INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free);
1952 ncur = (xfs_btree_cur_t *)0;
1953 pcur = cur;
1954 /*
1955 * Loop going up the tree, starting at the leaf level.
1956 * Stop when we don't get a split block, that must mean that
1957 * the insert is finished with this level.
1958 */
1959 do {
1960 /*
1961 * Insert nrec/nbno into this level of the tree.
1962 * Note if we fail, nbno will be null.
1963 */
1964 if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
1965 &i))) {
1966 if (pcur != cur)
1967 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
1968 return error;
1969 }
1970 /*
1971 * See if the cursor we just used is trash.
1972 * Can't trash the caller's cursor, but otherwise we should
1973 * if ncur is a new cursor or we're about to be done.
1974 */
1975 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
1976 cur->bc_nlevels = pcur->bc_nlevels;
1977 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
1978 }
1979 /*
1980 * If we got a new cursor, switch to it.
1981 */
1982 if (ncur) {
1983 pcur = ncur;
1984 ncur = (xfs_btree_cur_t *)0;
1985 }
1986 } while (nbno != NULLAGBLOCK);
1987 *stat = i;
1988 return 0;
1989}
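
The do/while above ascends one level per split and stops at the first level
that absorbs the record. A toy version of that control flow, where
insrec_model() pretends every level below full_levels has to split (names
are invented; illustrative only):

#include <stdio.h>
#include <stdbool.h>

static bool insrec_model(int level, int full_levels)
{
	return level < full_levels;	/* true: this level split */
}

int main(void)
{
	int full_levels = 2, level = 0;

	do {
		printf("inserting at level %d\n", level);
	} while (insrec_model(level++, full_levels));
	/* Prints levels 0, 1 and 2: two splits, then a clean insert. */
	return 0;
}
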
1990
1991/*
1992 * Lookup the record equal to ino in the btree given by cur.
1993 */
1994int /* error */
1995xfs_inobt_lookup_eq(
1996 xfs_btree_cur_t *cur, /* btree cursor */
1997 xfs_agino_t ino, /* starting inode of chunk */
1998 __int32_t fcnt, /* free inode count */
1999 xfs_inofree_t free, /* free inode mask */
2000 int *stat) /* success/failure */
2001{
2002 cur->bc_rec.i.ir_startino = ino;
2003 cur->bc_rec.i.ir_freecount = fcnt;
2004 cur->bc_rec.i.ir_free = free;
2005 return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
2006}
2007
2008/*
2009 * Lookup the first record greater than or equal to ino
2010 * in the btree given by cur.
2011 */
2012int /* error */
2013xfs_inobt_lookup_ge(
2014 xfs_btree_cur_t *cur, /* btree cursor */
2015 xfs_agino_t ino, /* starting inode of chunk */
2016 __int32_t fcnt, /* free inode count */
2017 xfs_inofree_t free, /* free inode mask */
2018 int *stat) /* success/failure */
2019{
2020 cur->bc_rec.i.ir_startino = ino;
2021 cur->bc_rec.i.ir_freecount = fcnt;
2022 cur->bc_rec.i.ir_free = free;
2023 return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
2024}
2025
2026/*
2027 * Lookup the first record less than or equal to ino
2028 * in the btree given by cur.
2029 */
2030int /* error */
2031xfs_inobt_lookup_le(
2032 xfs_btree_cur_t *cur, /* btree cursor */
2033 xfs_agino_t ino, /* starting inode of chunk */
2034 __int32_t fcnt, /* free inode count */
2035 xfs_inofree_t free, /* free inode mask */
2036 int *stat) /* success/failure */
2037{
2038 cur->bc_rec.i.ir_startino = ino;
2039 cur->bc_rec.i.ir_freecount = fcnt;
2040 cur->bc_rec.i.ir_free = free;
2041 return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
2042}
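
The three wrappers differ only in the comparison mode handed down to
xfs_inobt_lookup(). A standalone model of the EQ/LE/GE semantics over a
sorted key array, returning a 1-based position like the btree cursor and 0
for failure to mirror the *stat convention (a linear-scan sketch; the real
lookup is a binary search):

#include <assert.h>

enum lookup_mode { LOOKUP_EQ, LOOKUP_LE, LOOKUP_GE };

static int lookup_model(const int *keys, int n, int want,
			enum lookup_mode how)
{
	for (int i = 0; i < n; i++) {
		if (how == LOOKUP_GE && keys[i] >= want)
			return i + 1;
		if (how == LOOKUP_EQ && keys[i] == want)
			return i + 1;
		if (how == LOOKUP_LE && keys[i] > want)
			return i;	/* previous entry; 0 if none */
	}
	return how == LOOKUP_LE ? n : 0;
}

int main(void)
{
	int keys[] = { 10, 20, 30 };

	assert(lookup_model(keys, 3, 20, LOOKUP_EQ) == 2);
	assert(lookup_model(keys, 3, 25, LOOKUP_GE) == 3);
	assert(lookup_model(keys, 3, 25, LOOKUP_LE) == 2);
	assert(lookup_model(keys, 3,  5, LOOKUP_LE) == 0);
	assert(lookup_model(keys, 3, 35, LOOKUP_GE) == 0);
	return 0;
}
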
2043
2044/*
2045 * Update the record referred to by cur, to the value given
2046 * by [ino, fcnt, free].
2047 * This either works (return 0) or gets an EFSCORRUPTED error.
2048 */
2049int /* error */
2050xfs_inobt_update(
2051 xfs_btree_cur_t *cur, /* btree cursor */
2052 xfs_agino_t ino, /* starting inode of chunk */
2053 __int32_t fcnt, /* free inode count */
2054 xfs_inofree_t free) /* free inode mask */
2055{
2056 xfs_inobt_block_t *block; /* btree block to update */
2057 xfs_buf_t *bp; /* buffer containing btree block */
2058 int error; /* error return value */
2059 int ptr; /* current record number (updating) */
2060 xfs_inobt_rec_t *rp; /* pointer to updated record */
2061
2062 /*
2063 * Pick up the current block.
2064 */
2065 bp = cur->bc_bufs[0];
2066 block = XFS_BUF_TO_INOBT_BLOCK(bp);
2067#ifdef DEBUG
2068 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
2069 return error;
2070#endif
2071 /*
2072 * Get the address of the rec to be updated.
2073 */
2074 ptr = cur->bc_ptrs[0];
2075 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
2076 /*
2077 * Fill in the new contents and log them.
2078 */
2079 INT_SET(rp->ir_startino, ARCH_CONVERT, ino);
2080 INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt);
2081 INT_SET(rp->ir_free, ARCH_CONVERT, free);
2082 xfs_inobt_log_recs(cur, bp, ptr, ptr);
2083 /*
2084 * Updating first record in leaf. Pass new key value up to our parent.
2085 */
2086 if (ptr == 1) {
2087 xfs_inobt_key_t key; /* key containing [ino] */
2088
2089 INT_SET(key.ir_startino, ARCH_CONVERT, ino);
2090 if ((error = xfs_inobt_updkey(cur, &key, 1)))
2091 return error;
2092 }
2093 return 0;
2094}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000000..803c4d17a057
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -0,0 +1,314 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IALLOC_BTREE_H__
33#define __XFS_IALLOC_BTREE_H__
34
35/*
36 * Inode map on-disk structures
37 */
38
39struct xfs_buf;
40struct xfs_btree_cur;
41struct xfs_btree_sblock;
42struct xfs_mount;
43
44/*
45 * There is a btree for the inode map per allocation group.
46 */
47#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
48
49typedef __uint64_t xfs_inofree_t;
50#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
51#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
52#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
53
54#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASKN)
55xfs_inofree_t xfs_inobt_maskn(int i, int n);
56#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n)
57#else
58#define XFS_INOBT_MASKN(i,n) \
59 ((((n) >= XFS_INODES_PER_CHUNK ? \
60 (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i))
61#endif
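
XFS_INOBT_MASKN(i,n) builds a run of n set bits starting at bit i; the
n >= XFS_INODES_PER_CHUNK guard matters because a full-width shift would be
undefined behavior, while (xfs_inofree_t)0 - 1 wraps to all ones. A
standalone check of the macro's arithmetic under a local name, assuming the
usual 64 inodes per chunk (illustrative, not the kernel header):

#include <assert.h>
#include <stdint.h>

typedef uint64_t xfs_inofree_model_t;
#define INODES_PER_CHUNK 64

#define INOBT_MASKN_MODEL(i, n) \
	((((n) >= INODES_PER_CHUNK ? \
	(xfs_inofree_model_t)0 : ((xfs_inofree_model_t)1 << (n))) - 1) << (i))

int main(void)
{
	assert(INOBT_MASKN_MODEL(0, 4) == 0xfULL);
	assert(INOBT_MASKN_MODEL(4, 4) == 0xf0ULL);
	/* n == 64: 0 - 1 wraps to all-ones without a 64-bit shift. */
	assert(INOBT_MASKN_MODEL(0, 64) == ~0ULL);
	return 0;
}
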
62
63/*
64 * Data record structure
65 */
66typedef struct xfs_inobt_rec
67{
68 xfs_agino_t ir_startino; /* starting inode number */
69 __int32_t ir_freecount; /* count of free inodes (set bits) */
70 xfs_inofree_t ir_free; /* free inode mask */
71} xfs_inobt_rec_t;
72
73/*
74 * Key structure
75 */
76typedef struct xfs_inobt_key
77{
78 xfs_agino_t ir_startino; /* starting inode number */
79} xfs_inobt_key_t;
80
81typedef xfs_agblock_t xfs_inobt_ptr_t; /* btree pointer type */
82 /* btree block header type */
83typedef struct xfs_btree_sblock xfs_inobt_block_t;
84
85#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
86xfs_inobt_block_t *xfs_buf_to_inobt_block(struct xfs_buf *bp);
87#define XFS_BUF_TO_INOBT_BLOCK(bp) xfs_buf_to_inobt_block(bp)
88#else
89#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)(XFS_BUF_PTR(bp)))
90#endif
91
92/*
93 * Bit manipulations for ir_free.
94 */
95#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASK)
96xfs_inofree_t xfs_inobt_mask(int i);
97#define XFS_INOBT_MASK(i) xfs_inobt_mask(i)
98#else
99#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
100#endif
101#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_FREE)
102int xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i);
103#define XFS_INOBT_IS_FREE(rp,i) xfs_inobt_is_free(rp,i)
104#else
105#define XFS_INOBT_IS_FREE(rp,i) (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
106#endif
107#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_SET_FREE)
108void xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i);
109#define XFS_INOBT_SET_FREE(rp,i) xfs_inobt_set_free(rp,i)
110#else
111#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
112#endif
113#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_CLR_FREE)
114void xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i);
115#define XFS_INOBT_CLR_FREE(rp,i) xfs_inobt_clr_free(rp,i)
116#else
117#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
118#endif
119
120/*
121 * Real block structures have a size equal to the disk block size.
122 */
123#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_SIZE)
124int xfs_inobt_block_size(int lev, struct xfs_btree_cur *cur);
125#define XFS_INOBT_BLOCK_SIZE(lev,cur) xfs_inobt_block_size(lev,cur)
126#else
127#define XFS_INOBT_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
128#endif
129
130#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
131int xfs_inobt_block_maxrecs(int lev, struct xfs_btree_cur *cur);
132#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) xfs_inobt_block_maxrecs(lev,cur)
133#else
134#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) \
135 ((cur)->bc_mp->m_inobt_mxr[lev != 0])
136#endif
137#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MINRECS)
138int xfs_inobt_block_minrecs(int lev, struct xfs_btree_cur *cur);
139#define XFS_INOBT_BLOCK_MINRECS(lev,cur) xfs_inobt_block_minrecs(lev,cur)
140#else
141#define XFS_INOBT_BLOCK_MINRECS(lev,cur) \
142 ((cur)->bc_mp->m_inobt_mnr[lev != 0])
143#endif
144
145#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_LAST_REC)
146int xfs_inobt_is_last_rec(struct xfs_btree_cur *cur);
147#define XFS_INOBT_IS_LAST_REC(cur) xfs_inobt_is_last_rec(cur)
148#else
149#define XFS_INOBT_IS_LAST_REC(cur) \
150 ((cur)->bc_ptrs[0] == \
151 INT_GET(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs, ARCH_CONVERT))
152#endif
153
154/*
155 * Maximum number of inode btree levels.
156 */
157#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IN_MAXLEVELS)
158int xfs_in_maxlevels(struct xfs_mount *mp);
159#define XFS_IN_MAXLEVELS(mp) xfs_in_maxlevels(mp)
160#else
161#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
162#endif
163
164/*
165 * Block numbers in the AG.
166 */
167#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IBT_BLOCK)
168xfs_agblock_t xfs_ibt_block(struct xfs_mount *mp);
169#define XFS_IBT_BLOCK(mp) xfs_ibt_block(mp)
170#else
171#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
172#endif
173#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_PREALLOC_BLOCKS)
174xfs_agblock_t xfs_prealloc_blocks(struct xfs_mount *mp);
175#define XFS_PREALLOC_BLOCKS(mp) xfs_prealloc_blocks(mp)
176#else
177#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
178#endif
179
180/*
181 * Record, key, and pointer address macros for btree blocks.
182 */
183#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_REC_ADDR)
184xfs_inobt_rec_t *
185xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
186#define XFS_INOBT_REC_ADDR(bb,i,cur) xfs_inobt_rec_addr(bb,i,cur)
187#else
188#define XFS_INOBT_REC_ADDR(bb,i,cur) \
189 XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, i, \
190 XFS_INOBT_BLOCK_MAXRECS(0, cur))
191#endif
192
193#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_KEY_ADDR)
194xfs_inobt_key_t *
195xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
196#define XFS_INOBT_KEY_ADDR(bb,i,cur) xfs_inobt_key_addr(bb,i,cur)
197#else
198#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
199 XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
200 XFS_INOBT_BLOCK_MAXRECS(1, cur))
201#endif
202
203#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_PTR_ADDR)
204xfs_inobt_ptr_t *
205xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
206#define XFS_INOBT_PTR_ADDR(bb,i,cur) xfs_inobt_ptr_addr(bb,i,cur)
207#else
208#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
209 XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
210 XFS_INOBT_BLOCK_MAXRECS(1, cur))
211#endif
212
213/*
214 * Prototypes for externally visible routines.
215 */
216
217/*
218 * Decrement cursor by one record at the level.
219 * For nonzero levels the leaf-ward information is untouched.
220 */
221int /* error */
222xfs_inobt_decrement(
223 struct xfs_btree_cur *cur, /* btree cursor */
224 int level, /* level in btree, 0 is leaf */
225 int *stat); /* success/failure */
226
227/*
228 * Delete the record pointed to by cur.
229 * The cursor refers to the place where the record was (could be inserted)
230 * when the operation returns.
231 */
232int /* error */
233xfs_inobt_delete(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 int *stat); /* success/failure */
236
237/*
238 * Get the data from the pointed-to record.
239 */
240int /* error */
241xfs_inobt_get_rec(
242 struct xfs_btree_cur *cur, /* btree cursor */
243 xfs_agino_t *ino, /* output: starting inode of chunk */
244 __int32_t *fcnt, /* output: number of free inodes */
245 xfs_inofree_t *free, /* output: free inode mask */
246 int *stat); /* output: success/failure */
247
248/*
249 * Increment cursor by one record at the level.
250 * For nonzero levels the leaf-ward information is untouched.
251 */
252int /* error */
253xfs_inobt_increment(
254 struct xfs_btree_cur *cur, /* btree cursor */
255 int level, /* level in btree, 0 is leaf */
256 int *stat); /* success/failure */
257
258/*
259 * Insert the current record at the point referenced by cur.
260 * The cursor may be inconsistent on return if splits have been done.
261 */
262int /* error */
263xfs_inobt_insert(
264 struct xfs_btree_cur *cur, /* btree cursor */
265 int *stat); /* success/failure */
266
267/*
268 * Lookup the record equal to ino in the btree given by cur.
269 */
270int /* error */
271xfs_inobt_lookup_eq(
272 struct xfs_btree_cur *cur, /* btree cursor */
273 xfs_agino_t ino, /* starting inode of chunk */
274 __int32_t fcnt, /* free inode count */
275 xfs_inofree_t free, /* free inode mask */
276 int *stat); /* success/failure */
277
278/*
279 * Lookup the first record greater than or equal to ino
280 * in the btree given by cur.
281 */
282int /* error */
283xfs_inobt_lookup_ge(
284 struct xfs_btree_cur *cur, /* btree cursor */
285 xfs_agino_t ino, /* starting inode of chunk */
286 __int32_t fcnt, /* free inode count */
287 xfs_inofree_t free, /* free inode mask */
288 int *stat); /* success/failure */
289
290/*
291 * Lookup the first record less than or equal to ino
292 * in the btree given by cur.
293 */
294int /* error */
295xfs_inobt_lookup_le(
296 struct xfs_btree_cur *cur, /* btree cursor */
297 xfs_agino_t ino, /* starting inode of chunk */
298 __int32_t fcnt, /* free inode count */
299 xfs_inofree_t free, /* free inode mask */
300 int *stat); /* success/failure */
301
302/*
303 * Update the record referred to by cur, to the value given
304 * by [ino, fcnt, free].
305 * This either works (return 0) or gets an EFSCORRUPTED error.
306 */
307int /* error */
308xfs_inobt_update(
309 struct xfs_btree_cur *cur, /* btree cursor */
310 xfs_agino_t ino, /* starting inode of chunk */
311 __int32_t fcnt, /* free inode count */
312 xfs_inofree_t free); /* free inode mask */
313
314#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
new file mode 100644
index 000000000000..3a0ba1dfd0e8
--- /dev/null
+++ b/fs/xfs/xfs_iget.c
@@ -0,0 +1,1022 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_quota.h"
57#include "xfs_utils.h"
58#include "xfs_bit.h"
59
60/*
61 * Initialize the inode hash table for the newly mounted file system.
 62 * Choose an initial table size based on the user-specified value,
 63 * else derive it from the maximum number of inodes as an indicator
 64 * for table size, and clamp it between one and some large number
 65 * of pages.
66 */
67void
68xfs_ihash_init(xfs_mount_t *mp)
69{
70 __uint64_t icount;
71 uint i, flags = KM_SLEEP | KM_MAYFAIL;
72
73 if (!mp->m_ihsize) {
74 icount = mp->m_maxicount ? mp->m_maxicount :
75 (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog);
76 mp->m_ihsize = 1 << max_t(uint, 8,
77 (xfs_highbit64(icount) + 1) / 2);
78 mp->m_ihsize = min_t(uint, mp->m_ihsize,
79 (64 * NBPP) / sizeof(xfs_ihash_t));
80 }
81
82 while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize *
83 sizeof(xfs_ihash_t), flags))) {
84 if ((mp->m_ihsize >>= 1) <= NBPP)
85 flags = KM_SLEEP;
86 }
87 for (i = 0; i < mp->m_ihsize; i++) {
88 rwlock_init(&(mp->m_ihash[i].ih_lock));
89 }
90}
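
The sizing heuristic above amounts to 2^max(8, (log2(icount)+1)/2) buckets,
i.e. roughly sqrt(icount) with a 256-bucket floor. A standalone model of
that arithmetic; the NBPP-based clamp and the allocation-retry loop are
omitted, and the names are invented for illustration:

#include <assert.h>
#include <stdint.h>

static int highbit64(uint64_t v)	/* index of the highest set bit */
{
	int i = -1;

	for (; v != 0; v >>= 1)
		i++;
	return i;
}

static uint32_t ihash_size_model(uint64_t icount)
{
	int shift = (highbit64(icount) + 1) / 2;

	if (shift < 8)
		shift = 8;
	return 1u << shift;
}

int main(void)
{
	assert(ihash_size_model(1ULL << 20) == 1u << 10);	/* ~sqrt */
	assert(ihash_size_model(1000) == 256);			/* floor */
	return 0;
}
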
91
92/*
93 * Free up structures allocated by xfs_ihash_init, at unmount time.
94 */
95void
96xfs_ihash_free(xfs_mount_t *mp)
97{
98 kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t));
99 mp->m_ihash = NULL;
100}
101
102/*
103 * Initialize the inode cluster hash table for the newly mounted file system.
104 * Its size is derived from the ihash table size.
105 */
106void
107xfs_chash_init(xfs_mount_t *mp)
108{
109 uint i;
110
111 mp->m_chsize = max_t(uint, 1, mp->m_ihsize /
112 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog));
113 mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
114 mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
115 * sizeof(xfs_chash_t),
116 KM_SLEEP);
117 for (i = 0; i < mp->m_chsize; i++) {
118 spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
119 }
120}
121
122/*
123 * Free up structures allocated by xfs_chash_init, at unmount time.
124 */
125void
126xfs_chash_free(xfs_mount_t *mp)
127{
128 int i;
129
130 for (i = 0; i < mp->m_chsize; i++) {
131 spinlock_destroy(&mp->m_chash[i].ch_lock);
132 }
133
134 kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
135 mp->m_chash = NULL;
136}
137
138/*
139 * Look up an inode by number in the given file system.
140 * The inode is looked up in the hash table for the file system
141 * represented by the mount point parameter mp. Each bucket of
142 * the hash table is guarded by an individual semaphore.
143 *
144 * If the inode is found in the hash table, its corresponding vnode
145 * is obtained with a call to vn_get(). This call takes care of
146 * coordination with the reclamation of the inode and vnode. Note
147 * that the vmap structure is filled in while holding the hash lock.
148 * This gives us the state of the inode/vnode when we found it and
149 * is used for coordination in vn_get().
150 *
151 * If it is not in core, read it in from the file system's device and
152 * add the inode into the hash table.
153 *
154 * The inode is locked according to the value of the lock_flags parameter.
155 * This flag parameter indicates how and if the inode's IO lock and inode lock
156 * should be taken.
157 *
158 * mp -- the mount point structure for the current file system. It points
159 * to the inode hash table.
160 * tp -- a pointer to the current transaction if there is one. This is
161 * simply passed through to the xfs_iread() call.
162 * ino -- the number of the inode desired. This is the unique identifier
163 * within the file system for the inode being requested.
164 * lock_flags -- flags indicating how to lock the inode. See the comment
165 * for xfs_ilock() for a list of valid values.
166 * bno -- the block number starting the buffer containing the inode,
167 * if known (as by bulkstat), else 0.
168 */
169STATIC int
170xfs_iget_core(
171 vnode_t *vp,
172 xfs_mount_t *mp,
173 xfs_trans_t *tp,
174 xfs_ino_t ino,
175 uint flags,
176 uint lock_flags,
177 xfs_inode_t **ipp,
178 xfs_daddr_t bno)
179{
180 xfs_ihash_t *ih;
181 xfs_inode_t *ip;
182 xfs_inode_t *iq;
183 vnode_t *inode_vp;
184 ulong version;
185 int error;
186 /* REFERENCED */
187 xfs_chash_t *ch;
188 xfs_chashlist_t *chl, *chlnew;
189 SPLDECL(s);
190
191
192 ih = XFS_IHASH(mp, ino);
193
194again:
195 read_lock(&ih->ih_lock);
196
197 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
198 if (ip->i_ino == ino) {
199 /*
 200			 * If INEW is set, this inode is being set up;
 201			 * we need to pause and try again.
202 */
203 if (ip->i_flags & XFS_INEW) {
204 read_unlock(&ih->ih_lock);
205 delay(1);
206 XFS_STATS_INC(xs_ig_frecycle);
207
208 goto again;
209 }
210
211 inode_vp = XFS_ITOV_NULL(ip);
212 if (inode_vp == NULL) {
213 /*
 214				 * If IRECLAIM is set, this inode is
 215				 * on its way out of the system;
 216				 * we need to pause and try again.
217 */
218 if (ip->i_flags & XFS_IRECLAIM) {
219 read_unlock(&ih->ih_lock);
220 delay(1);
221 XFS_STATS_INC(xs_ig_frecycle);
222
223 goto again;
224 }
225
226 vn_trace_exit(vp, "xfs_iget.alloc",
227 (inst_t *)__return_address);
228
229 XFS_STATS_INC(xs_ig_found);
230
231 ip->i_flags &= ~XFS_IRECLAIMABLE;
232 read_unlock(&ih->ih_lock);
233
234 XFS_MOUNT_ILOCK(mp);
235 list_del_init(&ip->i_reclaim);
236 XFS_MOUNT_IUNLOCK(mp);
237
238 goto finish_inode;
239
240 } else if (vp != inode_vp) {
241 struct inode *inode = LINVFS_GET_IP(inode_vp);
242
243 /* The inode is being torn down, pause and
244 * try again.
245 */
246 if (inode->i_state & (I_FREEING | I_CLEAR)) {
247 read_unlock(&ih->ih_lock);
248 delay(1);
249 XFS_STATS_INC(xs_ig_frecycle);
250
251 goto again;
252 }
253/* Chances are the other vnode (the one in the inode) is being torn
254 * down right now, and we landed on top of it. Question is, what do
255 * we do? Unhook the old inode and hook up the new one?
256 */
257 cmn_err(CE_PANIC,
258 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
259 inode_vp, vp);
260 }
261
262 read_unlock(&ih->ih_lock);
263
264 XFS_STATS_INC(xs_ig_found);
265
266finish_inode:
267 if (ip->i_d.di_mode == 0) {
268 if (!(flags & IGET_CREATE))
269 return ENOENT;
270 xfs_iocore_inode_reinit(ip);
271 }
272
273 if (lock_flags != 0)
274 xfs_ilock(ip, lock_flags);
275
276 ip->i_flags &= ~XFS_ISTALE;
277
278 vn_trace_exit(vp, "xfs_iget.found",
279 (inst_t *)__return_address);
280 goto return_ip;
281 }
282 }
283
284 /*
285 * Inode cache miss: save the hash chain version stamp and unlock
286 * the chain, so we don't deadlock in vn_alloc.
287 */
288 XFS_STATS_INC(xs_ig_missed);
289
290 version = ih->ih_version;
291
292 read_unlock(&ih->ih_lock);
293
294 /*
295 * Read the disk inode attributes into a new inode structure and get
296 * a new vnode for it. This should also initialize i_ino and i_mount.
297 */
298 error = xfs_iread(mp, tp, ino, &ip, bno);
299 if (error) {
300 return error;
301 }
302
303 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
304
305 xfs_inode_lock_init(ip, vp);
306 xfs_iocore_inode_init(ip);
307
308 if (lock_flags != 0) {
309 xfs_ilock(ip, lock_flags);
310 }
311
312 if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) {
313 xfs_idestroy(ip);
314 return ENOENT;
315 }
316
317 /*
318 * Put ip on its hash chain, unless someone else hashed a duplicate
319 * after we released the hash lock.
320 */
321 write_lock(&ih->ih_lock);
322
323 if (ih->ih_version != version) {
324 for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) {
325 if (iq->i_ino == ino) {
326 write_unlock(&ih->ih_lock);
327 xfs_idestroy(ip);
328
329 XFS_STATS_INC(xs_ig_dup);
330 goto again;
331 }
332 }
333 }
334
335 /*
336 * These values _must_ be set before releasing ihlock!
337 */
338 ip->i_hash = ih;
339 if ((iq = ih->ih_next)) {
340 iq->i_prevp = &ip->i_next;
341 }
342 ip->i_next = iq;
343 ip->i_prevp = &ih->ih_next;
344 ih->ih_next = ip;
345 ip->i_udquot = ip->i_gdquot = NULL;
346 ih->ih_version++;
347 ip->i_flags |= XFS_INEW;
348
349 write_unlock(&ih->ih_lock);
350
351 /*
352 * put ip on its cluster's hash chain
353 */
354 ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
355 ip->i_cnext == NULL);
356
357 chlnew = NULL;
358 ch = XFS_CHASH(mp, ip->i_blkno);
359 chlredo:
360 s = mutex_spinlock(&ch->ch_lock);
361 for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
362 if (chl->chl_blkno == ip->i_blkno) {
363
364 /* insert this inode into the doubly-linked list
365 * where chl points */
366 if ((iq = chl->chl_ip)) {
367 ip->i_cprev = iq->i_cprev;
368 iq->i_cprev->i_cnext = ip;
369 iq->i_cprev = ip;
370 ip->i_cnext = iq;
371 } else {
372 ip->i_cnext = ip;
373 ip->i_cprev = ip;
374 }
375 chl->chl_ip = ip;
376 ip->i_chash = chl;
377 break;
378 }
379 }
380
381 /* no hash list found for this block; add a new hash list */
382 if (chl == NULL) {
383 if (chlnew == NULL) {
384 mutex_spinunlock(&ch->ch_lock, s);
385 ASSERT(xfs_chashlist_zone != NULL);
386 chlnew = (xfs_chashlist_t *)
387 kmem_zone_alloc(xfs_chashlist_zone,
388 KM_SLEEP);
389 ASSERT(chlnew != NULL);
390 goto chlredo;
391 } else {
392 ip->i_cnext = ip;
393 ip->i_cprev = ip;
394 ip->i_chash = chlnew;
395 chlnew->chl_ip = ip;
396 chlnew->chl_blkno = ip->i_blkno;
397 chlnew->chl_next = ch->ch_list;
398 ch->ch_list = chlnew;
399 chlnew = NULL;
400 }
401 } else {
402 if (chlnew != NULL) {
403 kmem_zone_free(xfs_chashlist_zone, chlnew);
404 }
405 }
406
407 mutex_spinunlock(&ch->ch_lock, s);
408
409
410 /*
411 * Link ip to its mount and thread it on the mount's inode list.
412 */
413 XFS_MOUNT_ILOCK(mp);
414 if ((iq = mp->m_inodes)) {
415 ASSERT(iq->i_mprev->i_mnext == iq);
416 ip->i_mprev = iq->i_mprev;
417 iq->i_mprev->i_mnext = ip;
418 iq->i_mprev = ip;
419 ip->i_mnext = iq;
420 } else {
421 ip->i_mnext = ip;
422 ip->i_mprev = ip;
423 }
424 mp->m_inodes = ip;
425
426 XFS_MOUNT_IUNLOCK(mp);
427
428 return_ip:
429 ASSERT(ip->i_df.if_ext_max ==
430 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
431
432 ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
433 ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
434
435 *ipp = ip;
436
437 /*
438 * If we have a real type for an on-disk inode, we can set ops(&unlock)
439 * now. If it's a new inode being created, xfs_ialloc will handle it.
440 */
441 VFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
442
443 return 0;
444}
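
The ih_version stamp used above turns "did anyone insert while we slept?"
into a single integer compare after relocking: every insert bumps the
counter, so a mismatch forces a re-walk of the chain looking for a
duplicate. A minimal sequential illustration of the pattern, with
hypothetical names and no real locking:

#include <assert.h>

struct chain_model {
	unsigned long version;	/* bumped on every insert */
};

static void chain_insert_model(struct chain_model *c)
{
	c->version++;
}

int main(void)
{
	struct chain_model c = { 0 };
	unsigned long saved = c.version;	/* read under the lock */

	chain_insert_model(&c);			/* racing insert elsewhere */
	/* On relock: a mismatch means we must re-check for duplicates. */
	assert(c.version != saved);
	return 0;
}
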
445
446
447/*
448 * The 'normal' internal xfs_iget; if needed it will
449 * 'allocate', or 'get', the vnode.
450 */
451int
452xfs_iget(
453 xfs_mount_t *mp,
454 xfs_trans_t *tp,
455 xfs_ino_t ino,
456 uint flags,
457 uint lock_flags,
458 xfs_inode_t **ipp,
459 xfs_daddr_t bno)
460{
461 struct inode *inode;
462 vnode_t *vp = NULL;
463 int error;
464
465retry:
466 XFS_STATS_INC(xs_ig_attempts);
467
468 if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
469 bhv_desc_t *bdp;
470 xfs_inode_t *ip;
471 int newnode;
472
473 vp = LINVFS_GET_VP(inode);
474 if (inode->i_state & I_NEW) {
475inode_allocate:
476 vn_initialize(inode);
477 error = xfs_iget_core(vp, mp, tp, ino, flags,
478 lock_flags, ipp, bno);
479 if (error) {
480 vn_mark_bad(vp);
481 if (inode->i_state & I_NEW)
482 unlock_new_inode(inode);
483 iput(inode);
484 }
485 } else {
 486			/* These are set if the inode is being inactivated or
 487			 * reclaimed. The linux inode is about to go away;
 488			 * wait for that path to finish, and try again.
489 */
490 if (vp->v_flag & (VINACT | VRECLM)) {
491 vn_wait(vp);
492 iput(inode);
493 goto retry;
494 }
495
496 if (is_bad_inode(inode)) {
497 iput(inode);
498 return EIO;
499 }
500
501 bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
502 if (bdp == NULL) {
503 XFS_STATS_INC(xs_ig_dup);
504 goto inode_allocate;
505 }
506 ip = XFS_BHVTOI(bdp);
507 if (lock_flags != 0)
508 xfs_ilock(ip, lock_flags);
509 newnode = (ip->i_d.di_mode == 0);
510 if (newnode)
511 xfs_iocore_inode_reinit(ip);
512 XFS_STATS_INC(xs_ig_found);
513 *ipp = ip;
514 error = 0;
515 }
516 } else
517 error = ENOMEM; /* If we got no inode we are out of memory */
518
519 return error;
520}
521
522/*
523 * Do the setup for the various locks within the incore inode.
524 */
525void
526xfs_inode_lock_init(
527 xfs_inode_t *ip,
528 vnode_t *vp)
529{
530 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
531 "xfsino", (long)vp->v_number);
532 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number);
533 init_waitqueue_head(&ip->i_ipin_wait);
534 atomic_set(&ip->i_pincount, 0);
535 init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number);
536}
537
538/*
539 * Look for the inode corresponding to the given ino in the hash table.
540 * If it is there and its i_transp pointer matches tp, return it.
541 * Otherwise, return NULL.
542 */
543xfs_inode_t *
544xfs_inode_incore(xfs_mount_t *mp,
545 xfs_ino_t ino,
546 xfs_trans_t *tp)
547{
548 xfs_ihash_t *ih;
549 xfs_inode_t *ip;
550
551 ih = XFS_IHASH(mp, ino);
552 read_lock(&ih->ih_lock);
553 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
554 if (ip->i_ino == ino) {
555 /*
556 * If we find it and tp matches, return it.
557 * Otherwise break from the loop and return
558 * NULL.
559 */
560 if (ip->i_transp == tp) {
561 read_unlock(&ih->ih_lock);
562 return (ip);
563 }
564 break;
565 }
566 }
567 read_unlock(&ih->ih_lock);
568 return (NULL);
569}
570
571/*
572 * Decrement reference count of an inode structure and unlock it.
573 *
574 * ip -- the inode being released
 575 * lock_flags -- this parameter indicates the inode's locks
 576 * 	to be released. See the comment on xfs_iunlock() for a list
577 * of valid values.
578 */
579void
580xfs_iput(xfs_inode_t *ip,
581 uint lock_flags)
582{
583 vnode_t *vp = XFS_ITOV(ip);
584
585 vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
586
587 xfs_iunlock(ip, lock_flags);
588
589 VN_RELE(vp);
590}
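
A minimal sketch of the common get/put pairing (the flag choices are illustrative): xfs_iget() returns the inode with the requested locks held, and xfs_iput() undoes both the lock and the vnode reference in one call.

    error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
    if (error)
            return error;
    /* ... read ip->i_d fields under the shared inode lock ... */
    xfs_iput(ip, XFS_ILOCK_SHARED);     /* unlock and drop the vnode ref */
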
591
592/*
593 * Special iput for brand-new inodes that are still locked
594 */
595void
596xfs_iput_new(xfs_inode_t *ip,
597 uint lock_flags)
598{
599 vnode_t *vp = XFS_ITOV(ip);
600 struct inode *inode = LINVFS_GET_IP(vp);
601
602 vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
603
604 if ((ip->i_d.di_mode == 0)) {
605 ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE));
606 vn_mark_bad(vp);
607 }
608 if (inode->i_state & I_NEW)
609 unlock_new_inode(inode);
610 if (lock_flags)
611 xfs_iunlock(ip, lock_flags);
612 VN_RELE(vp);
613}
614
615
616/*
617 * This routine embodies the part of the reclaim code that pulls
618 * the inode from the inode hash table and the mount structure's
619 * inode list.
620 * This should only be called from xfs_reclaim().
621 */
622void
623xfs_ireclaim(xfs_inode_t *ip)
624{
625 vnode_t *vp;
626
627 /*
628 * Remove from old hash list and mount list.
629 */
630 XFS_STATS_INC(xs_ig_reclaims);
631
632 xfs_iextract(ip);
633
634 /*
635 * Here we do a spurious inode lock in order to coordinate with
636 * xfs_sync(). This is because xfs_sync() references the inodes
637 * in the mount list without taking references on the corresponding
638 * vnodes. We make that OK here by ensuring that we wait until
639 * the inode is unlocked in xfs_sync() before we go ahead and
640 * free it. We get both the regular lock and the io lock because
641 * the xfs_sync() code may need to drop the regular one but will
642 * still hold the io lock.
643 */
644 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
645
646 /*
647 * Release dquots (and their references) if any. An inode may escape
648 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
649 */
650 XFS_QM_DQDETACH(ip->i_mount, ip);
651
652 /*
653 * Pull our behavior descriptor from the vnode chain.
654 */
655 vp = XFS_ITOV_NULL(ip);
656 if (vp) {
657 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
658 }
659
660 /*
661 * Free all memory associated with the inode.
662 */
663 xfs_idestroy(ip);
664}
665
666/*
667 * This routine removes an about-to-be-destroyed inode from
668 * all of the lists in which it is located with the exception
669 * of the behavior chain.
670 */
671void
672xfs_iextract(
673 xfs_inode_t *ip)
674{
675 xfs_ihash_t *ih;
676 xfs_inode_t *iq;
677 xfs_mount_t *mp;
678 xfs_chash_t *ch;
679 xfs_chashlist_t *chl, *chm;
680 SPLDECL(s);
681
682 ih = ip->i_hash;
683 write_lock(&ih->ih_lock);
684 if ((iq = ip->i_next)) {
685 iq->i_prevp = ip->i_prevp;
686 }
687 *ip->i_prevp = iq;
688 write_unlock(&ih->ih_lock);
689
690 /*
691 * Remove from cluster hash list
692 * 1) delete the chashlist if this is the last inode on the chashlist
693 * 2) unchain from list of inodes
694 * 3) point chashlist->chl_ip to the next inode if it points to this inode.
695 */
696 mp = ip->i_mount;
697 ch = XFS_CHASH(mp, ip->i_blkno);
698 s = mutex_spinlock(&ch->ch_lock);
699
700 if (ip->i_cnext == ip) {
701 /* Last inode on chashlist */
702 ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
703 ASSERT(ip->i_chash != NULL);
704 chm=NULL;
705 for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
706 if (chl->chl_blkno == ip->i_blkno) {
707 if (chm == NULL) {
708 /* first item on the list */
709 ch->ch_list = chl->chl_next;
710 } else {
711 chm->chl_next = chl->chl_next;
712 }
713 kmem_zone_free(xfs_chashlist_zone, chl);
714 break;
715 } else {
716 ASSERT(chl->chl_ip != ip);
717 chm = chl;
718 }
719 }
720 ASSERT_ALWAYS(chl != NULL);
721 } else {
722 /* delete one inode from a non-empty list */
723 iq = ip->i_cnext;
724 iq->i_cprev = ip->i_cprev;
725 ip->i_cprev->i_cnext = iq;
726 if (ip->i_chash->chl_ip == ip) {
727 ip->i_chash->chl_ip = iq;
728 }
729 ip->i_chash = __return_address;
730 ip->i_cprev = __return_address;
731 ip->i_cnext = __return_address;
732 }
733 mutex_spinunlock(&ch->ch_lock, s);
734
735 /*
736 * Remove from mount's inode list.
737 */
738 XFS_MOUNT_ILOCK(mp);
739 ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
740 iq = ip->i_mnext;
741 iq->i_mprev = ip->i_mprev;
742 ip->i_mprev->i_mnext = iq;
743
744 /*
745 * Fix up the head pointer if it points to the inode being deleted.
746 */
747 if (mp->m_inodes == ip) {
748 if (ip == iq) {
749 mp->m_inodes = NULL;
750 } else {
751 mp->m_inodes = iq;
752 }
753 }
754
755 /* Deal with the deleted inodes list */
756 list_del_init(&ip->i_reclaim);
757
758 mp->m_ireclaims++;
759 XFS_MOUNT_IUNLOCK(mp);
760}
761
762/*
763 * This is a wrapper routine around the xfs_ilock() routine
764 * used to centralize some grungy code. It is used in places
765 * that wish to lock the inode solely for reading the extents.
766 * The reason these places can't just call xfs_ilock(SHARED)
767 * is that the inode lock also guards the bringing in of the
768 * extents from disk for a file in b-tree format. If the inode
769 * is in b-tree format, then we need to lock the inode exclusively
770 * until the extents are read in. Locking it exclusively all
771 * the time would limit our parallelism unnecessarily, though.
772 * What we do instead is check to see if the extents have been
773 * read in yet, and only lock the inode exclusively if they
774 * have not.
775 *
776 * The function returns a value which should be given to the
777 * corresponding xfs_iunlock_map_shared(). This value is
778 * the mode in which the lock was actually taken.
779 */
780uint
781xfs_ilock_map_shared(
782 xfs_inode_t *ip)
783{
784 uint lock_mode;
785
786 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
787 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
788 lock_mode = XFS_ILOCK_EXCL;
789 } else {
790 lock_mode = XFS_ILOCK_SHARED;
791 }
792
793 xfs_ilock(ip, lock_mode);
794
795 return lock_mode;
796}
797
798/*
799 * This is simply the unlock routine to go with xfs_ilock_map_shared().
800 * All it does is call xfs_iunlock() with the given lock_mode.
801 */
802void
803xfs_iunlock_map_shared(
804 xfs_inode_t *ip,
805 unsigned int lock_mode)
806{
807 xfs_iunlock(ip, lock_mode);
808}
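
As a sketch of how these two routines pair up, a caller that only needs to read the extent list (the walking body is hypothetical) saves the returned mode and hands it back on unlock:

    uint lock_mode;

    lock_mode = xfs_ilock_map_shared(ip);   /* EXCL only if extents not read in */
    /* ... walk the data fork extent records ... */
    xfs_iunlock_map_shared(ip, lock_mode);  /* drop whichever mode was taken */
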
809
810/*
811 * The xfs inode contains 2 locks: a multi-reader lock called the
812 * i_iolock and a multi-reader lock called the i_lock. This routine
813 * allows either or both of the locks to be obtained.
814 *
815 * The 2 locks should always be ordered so that the IO lock is
816 * obtained first in order to prevent deadlock.
817 *
818 * ip -- the inode being locked
819 * lock_flags -- this parameter indicates the inode's locks
820 * to be locked. It can be:
821 * XFS_IOLOCK_SHARED,
822 * XFS_IOLOCK_EXCL,
823 * XFS_ILOCK_SHARED,
824 * XFS_ILOCK_EXCL,
825 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
826 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
827 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
828 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
829 */
830void
831xfs_ilock(xfs_inode_t *ip,
832 uint lock_flags)
833{
834 /*
835 * You can't set both SHARED and EXCL for the same lock,
836 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
837 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
838 */
839 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
840 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
841 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
842 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
843 ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
844
845 if (lock_flags & XFS_IOLOCK_EXCL) {
846 mrupdate(&ip->i_iolock);
847 } else if (lock_flags & XFS_IOLOCK_SHARED) {
848 mraccess(&ip->i_iolock);
849 }
850 if (lock_flags & XFS_ILOCK_EXCL) {
851 mrupdate(&ip->i_lock);
852 } else if (lock_flags & XFS_ILOCK_SHARED) {
853 mraccess(&ip->i_lock);
854 }
855 xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address);
856}
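
For illustration, a caller needing both locks takes them in a single call, which also encodes the IO-lock-before-inode-lock ordering described above (the critical section is hypothetical):

    xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
    /* ... file data and inode metadata are both protected here ... */
    xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
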
857
858/*
859 * This is just like xfs_ilock(), except that the caller
860 * is guaranteed not to sleep. It returns 1 if it gets
861 * the requested locks and 0 otherwise. If the IO lock is
862 * obtained but the inode lock cannot be, then the IO lock
863 * is dropped before returning.
864 *
865 * ip -- the inode being locked
866 * lock_flags -- this parameter indicates the inode's locks to
867 * be locked. See the comment for xfs_ilock() for a list
868 * of valid values.
869 *
870 */
871int
872xfs_ilock_nowait(xfs_inode_t *ip,
873 uint lock_flags)
874{
875 int iolocked;
876 int ilocked;
877
878 /*
879 * You can't set both SHARED and EXCL for the same lock,
880 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
881 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
882 */
883 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
884 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
885 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
886 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
887 ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
888
889 iolocked = 0;
890 if (lock_flags & XFS_IOLOCK_EXCL) {
891 iolocked = mrtryupdate(&ip->i_iolock);
892 if (!iolocked) {
893 return 0;
894 }
895 } else if (lock_flags & XFS_IOLOCK_SHARED) {
896 iolocked = mrtryaccess(&ip->i_iolock);
897 if (!iolocked) {
898 return 0;
899 }
900 }
901 if (lock_flags & XFS_ILOCK_EXCL) {
902 ilocked = mrtryupdate(&ip->i_lock);
903 if (!ilocked) {
904 if (iolocked) {
905 mrunlock(&ip->i_iolock);
906 }
907 return 0;
908 }
909 } else if (lock_flags & XFS_ILOCK_SHARED) {
910 ilocked = mrtryaccess(&ip->i_lock);
911 if (!ilocked) {
912 if (iolocked) {
913 mrunlock(&ip->i_iolock);
914 }
915 return 0;
916 }
917 }
918 xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address);
919 return 1;
920}
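
A hedged sketch of the try-lock pattern this enables; the fall-back-to-blocking policy is hypothetical, and callers that must not sleep would skip the inode instead:

    if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED)) {
            /* could not get both locks without sleeping */
            xfs_ilock(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED);
    }
    /* ... */
    xfs_iunlock(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED);
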
921
922/*
923 * xfs_iunlock() is used to drop the inode locks acquired with
924 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
925 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
926 * that we know which locks to drop.
927 *
928 * ip -- the inode being unlocked
929 * lock_flags -- this parameter indicates the inode's locks to
930 * be unlocked. See the comment for xfs_ilock() for a list
931 * of valid values for this parameter.
932 *
933 */
934void
935xfs_iunlock(xfs_inode_t *ip,
936 uint lock_flags)
937{
938 /*
939 * You can't set both SHARED and EXCL for the same lock,
940 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
941 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
942 */
943 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
944 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
945 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
946 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
947 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0);
948 ASSERT(lock_flags != 0);
949
950 if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
951 ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) ||
952 (ismrlocked(&ip->i_iolock, MR_ACCESS)));
953 ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) ||
954 (ismrlocked(&ip->i_iolock, MR_UPDATE)));
955 mrunlock(&ip->i_iolock);
956 }
957
958 if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) {
959 ASSERT(!(lock_flags & XFS_ILOCK_SHARED) ||
960 (ismrlocked(&ip->i_lock, MR_ACCESS)));
961 ASSERT(!(lock_flags & XFS_ILOCK_EXCL) ||
962 (ismrlocked(&ip->i_lock, MR_UPDATE)));
963 mrunlock(&ip->i_lock);
964
965 /*
966 * Let the AIL know that this item has been unlocked in case
967 * it is in the AIL and anyone is waiting on it. Don't do
968 * this if the caller has asked us not to.
969 */
970 if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) &&
971 ip->i_itemp != NULL) {
972 xfs_trans_unlocked_item(ip->i_mount,
973 (xfs_log_item_t*)(ip->i_itemp));
974 }
975 }
976 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
977}
978
979/*
980 * Give up write locks. The I/O lock cannot be held nested
981 * if it is being demoted.
982 */
983void
984xfs_ilock_demote(xfs_inode_t *ip,
985 uint lock_flags)
986{
987 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
988 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
989
990 if (lock_flags & XFS_ILOCK_EXCL) {
991 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
992 mrdemote(&ip->i_lock);
993 }
994 if (lock_flags & XFS_IOLOCK_EXCL) {
995 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
996 mrdemote(&ip->i_iolock);
997 }
998}
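
As a sketch, demotion lets a writer finish its exclusive work and admit readers without an unlock/relock window in which the inode is unprotected:

    xfs_ilock(ip, XFS_ILOCK_EXCL);
    /* ... exclusive update ... */
    xfs_ilock_demote(ip, XFS_ILOCK_EXCL);   /* now held shared */
    /* ... read-only work; other readers may now enter ... */
    xfs_iunlock(ip, XFS_ILOCK_SHARED);
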
999
1000/*
1001 * The following three routines simply manage the i_flock
1002 * semaphore embedded in the inode. This semaphore synchronizes
1003 * processes attempting to flush the in-core inode back to disk.
1004 */
1005void
1006xfs_iflock(xfs_inode_t *ip)
1007{
1008 psema(&(ip->i_flock), PINOD|PLTWAIT);
1009}
1010
1011int
1012xfs_iflock_nowait(xfs_inode_t *ip)
1013{
1014 return (cpsema(&(ip->i_flock)));
1015}
1016
1017void
1018xfs_ifunlock(xfs_inode_t *ip)
1019{
1020 ASSERT(valusema(&(ip->i_flock)) <= 0);
1021 vsema(&(ip->i_flock));
1022}
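
A brief sketch of both acquisition styles (the flush body is hypothetical; the real flush paths release the lock via xfs_ifunlock() once the I/O completes):

    xfs_iflock(ip);                 /* blocking: serialize with other flushers */
    /* ... write the in-core inode back to its buffer ... */
    xfs_ifunlock(ip);

    if (xfs_iflock_nowait(ip)) {    /* opportunistic: skip if already flushing */
            /* ... */
            xfs_ifunlock(ip);
    }
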
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
new file mode 100644
index 000000000000..e385064a066a
--- /dev/null
+++ b/fs/xfs/xfs_imap.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IMAP_H__
33#define __XFS_IMAP_H__
34
35/*
36 * This is the structure passed to xfs_imap() to map
37 * an inode number to its on disk location.
38 */
39typedef struct xfs_imap {
40 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
41 uint im_len; /* length in BBs of inode chunk */
42 xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */
43 ushort im_ioffset; /* inode offset in block in "inodes" */
44 ushort im_boffset; /* inode offset in block in bytes */
45} xfs_imap_t;
46
47#ifdef __KERNEL__
48struct xfs_mount;
49struct xfs_trans;
50int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
51 xfs_imap_t *, uint);
52#endif
53
54#endif /* __XFS_IMAP_H__ */
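
For illustration, a lookup fills this structure in and the caller then knows exactly which disk blocks to read and where the inode sits inside them (mp, tp, and ino are assumed; compare the use in xfs_inotobp() below):

    xfs_imap_t  imap;
    int         error;

    imap.im_blkno = 0;      /* no hint: force a full lookup */
    error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
    if (error)
            return error;
    /* the chunk spans [im_blkno, im_blkno + im_len) in basic blocks;
     * the inode itself starts im_boffset bytes into that buffer */
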
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
new file mode 100644
index 000000000000..43c632ab86ad
--- /dev/null
+++ b/fs/xfs/xfs_inode.c
@@ -0,0 +1,3876 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_trans_priv.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_imap.h"
51#include "xfs_alloc.h"
52#include "xfs_ialloc.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode_item.h"
58#include "xfs_inode.h"
59#include "xfs_bmap.h"
60#include "xfs_buf_item.h"
61#include "xfs_rw.h"
62#include "xfs_error.h"
63#include "xfs_bit.h"
64#include "xfs_utils.h"
65#include "xfs_dir2_trace.h"
66#include "xfs_quota.h"
67#include "xfs_mac.h"
68#include "xfs_acl.h"
69
70
71kmem_zone_t *xfs_ifork_zone;
72kmem_zone_t *xfs_inode_zone;
73kmem_zone_t *xfs_chashlist_zone;
74
75/*
76 * Used in xfs_itruncate(). This is the maximum number of extents
77 * freed from a file in a single transaction.
78 */
79#define XFS_ITRUNC_MAX_EXTENTS 2
80
81STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
82STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
83STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
84STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
85
86
87#ifdef DEBUG
88/*
89 * Make sure that the extents in the given memory buffer
90 * are valid.
91 */
92STATIC void
93xfs_validate_extents(
94 xfs_bmbt_rec_t *ep,
95 int nrecs,
96 int disk,
97 xfs_exntfmt_t fmt)
98{
99 xfs_bmbt_irec_t irec;
100 xfs_bmbt_rec_t rec;
101 int i;
102
103 for (i = 0; i < nrecs; i++) {
104 rec.l0 = get_unaligned((__uint64_t*)&ep->l0);
105 rec.l1 = get_unaligned((__uint64_t*)&ep->l1);
106 if (disk)
107 xfs_bmbt_disk_get_all(&rec, &irec);
108 else
109 xfs_bmbt_get_all(&rec, &irec);
110 if (fmt == XFS_EXTFMT_NOSTATE)
111 ASSERT(irec.br_state == XFS_EXT_NORM);
112 ep++;
113 }
114}
115#else /* DEBUG */
116#define xfs_validate_extents(ep, nrecs, disk, fmt)
117#endif /* DEBUG */
118
119/*
120 * Check that none of the inodes in the buffer have a next
121 * unlinked field of 0.
122 */
123#if defined(DEBUG)
124void
125xfs_inobp_check(
126 xfs_mount_t *mp,
127 xfs_buf_t *bp)
128{
129 int i;
130 int j;
131 xfs_dinode_t *dip;
132
133 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
134
135 for (i = 0; i < j; i++) {
136 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
137 i * mp->m_sb.sb_inodesize);
138 if (!dip->di_next_unlinked) {
139 xfs_fs_cmn_err(CE_ALERT, mp,
140 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.",
141 bp);
142 ASSERT(dip->di_next_unlinked);
143 }
144 }
145}
146#endif
147
148/*
149 * Called from bwrite on XFS inode buffers.
150 */
151void
152xfs_inobp_bwcheck(xfs_buf_t *bp)
153{
154 xfs_mount_t *mp;
155 int i;
156 int j;
157 xfs_dinode_t *dip;
158
159 ASSERT(XFS_BUF_FSPRIVATE3(bp, void *) != NULL);
160
161 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
162
163
164 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
165
166 for (i = 0; i < j; i++) {
167 dip = (xfs_dinode_t *) xfs_buf_offset(bp,
168 i * mp->m_sb.sb_inodesize);
169 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
170 cmn_err(CE_WARN,
171"Bad magic # 0x%x in XFS inode buffer 0x%Lx, starting blockno %Ld, offset 0x%x",
172 INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
173 (__uint64_t)(__psunsigned_t) bp,
174 (__int64_t) XFS_BUF_ADDR(bp),
175 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
176 xfs_fs_cmn_err(CE_WARN, mp,
177 "corrupt, unmount and run xfs_repair");
178 }
179 if (!dip->di_next_unlinked) {
180 cmn_err(CE_WARN,
181"Bad next_unlinked field (0) in XFS inode buffer 0x%p, starting blockno %Ld, offset 0x%x",
182 (__uint64_t)(__psunsigned_t) bp,
183 (__int64_t) XFS_BUF_ADDR(bp),
184 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
185 xfs_fs_cmn_err(CE_WARN, mp,
186 "corrupt, unmount and run xfs_repair");
187 }
188 }
189
190 return;
191}
192
193/*
194 * This routine is called to map an inode number within a file
195 * system to the buffer containing the on-disk version of the
196 * inode. It returns a pointer to the buffer containing the
197 * on-disk inode in the bpp parameter, and in the dipp parameter
198 * it returns a pointer to the on-disk inode within that buffer.
199 *
200 * If a non-zero error is returned, then the contents of bpp and
201 * dipp are undefined.
202 *
203 * Use xfs_imap() to determine the size and location of the
204 * buffer to read from disk.
205 */
206int
207xfs_inotobp(
208 xfs_mount_t *mp,
209 xfs_trans_t *tp,
210 xfs_ino_t ino,
211 xfs_dinode_t **dipp,
212 xfs_buf_t **bpp,
213 int *offset)
214{
215 int di_ok;
216 xfs_imap_t imap;
217 xfs_buf_t *bp;
218 int error;
219 xfs_dinode_t *dip;
220
221 /*
222 * Call the space management code to find the location of the
223 * inode on disk.
224 */
225 imap.im_blkno = 0;
226 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
227 if (error != 0) {
228 cmn_err(CE_WARN,
229 "xfs_inotobp: xfs_imap() returned an "
230 "error %d on %s. Returning error.", error, mp->m_fsname);
231 return error;
232 }
233
234 /*
235 * If the inode number maps to a block outside the bounds of the
236 * file system then return an error rather than calling read_buf
237 * and panicking when we get an error from the driver.
238 */
239 if ((imap.im_blkno + imap.im_len) >
240 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
241 cmn_err(CE_WARN,
242 "xfs_inotobp: inode number (%d + %d) maps to a block outside the bounds "
243 "of the file system %s. Returning EINVAL.",
244 imap.im_blkno, imap.im_len,mp->m_fsname);
245 return XFS_ERROR(EINVAL);
246 }
247
248 /*
249 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
250 * default to just a read_buf() call.
251 */
252 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
253 (int)imap.im_len, XFS_BUF_LOCK, &bp);
254
255 if (error) {
256 cmn_err(CE_WARN,
257 "xfs_inotobp: xfs_trans_read_buf() returned an "
258 "error %d on %s. Returning error.", error, mp->m_fsname);
259 return error;
260 }
261 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
262 di_ok =
263 INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
264 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
265 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
266 XFS_RANDOM_ITOBP_INOTOBP))) {
267 XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
268 xfs_trans_brelse(tp, bp);
269 cmn_err(CE_WARN,
270 "xfs_inotobp: XFS_TEST_ERROR() returned an "
271 "error on %s. Returning EFSCORRUPTED.", mp->m_fsname);
272 return XFS_ERROR(EFSCORRUPTED);
273 }
274
275 xfs_inobp_check(mp, bp);
276
277 /*
278 * Set *dipp to point to the on-disk inode in the buffer.
279 */
280 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
281 *bpp = bp;
282 *offset = imap.im_boffset;
283 return 0;
284}
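
A minimal caller sketch (variable names hypothetical): map the inode number, inspect the on-disk inode, and release the buffer the same way this file does elsewhere:

    xfs_dinode_t  *dip;
    xfs_buf_t     *bp;
    int           offset, error;

    error = xfs_inotobp(mp, tp, ino, &dip, &bp, &offset);
    if (error)
            return error;
    /* ... examine dip->di_core fields, converting with INT_GET ... */
    xfs_trans_brelse(tp, bp);       /* plain brelse() when tp == NULL */
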
285
286
287/*
288 * This routine is called to map an inode to the buffer containing
289 * the on-disk version of the inode. It returns a pointer to the
290 * buffer containing the on-disk inode in the bpp parameter, and in
291 * the dipp parameter it returns a pointer to the on-disk inode within
292 * that buffer.
293 *
294 * If a non-zero error is returned, then the contents of bpp and
295 * dipp are undefined.
296 *
297 * If the inode is new and has not yet been initialized, use xfs_imap()
298 * to determine the size and location of the buffer to read from disk.
299 * If the inode has already been mapped to its buffer and read in once,
300 * then use the mapping information stored in the inode rather than
301 * calling xfs_imap(). This allows us to avoid the overhead of looking
302 * at the inode btree for small block file systems (see xfs_dilocate()).
303 * We can tell whether the inode has been mapped in before by comparing
304 * its disk block address to 0. Only uninitialized inodes will have
305 * 0 for the disk block address.
306 */
307int
308xfs_itobp(
309 xfs_mount_t *mp,
310 xfs_trans_t *tp,
311 xfs_inode_t *ip,
312 xfs_dinode_t **dipp,
313 xfs_buf_t **bpp,
314 xfs_daddr_t bno)
315{
316 xfs_buf_t *bp;
317 int error;
318 xfs_imap_t imap;
319#ifdef __KERNEL__
320 int i;
321 int ni;
322#endif
323
324 if (ip->i_blkno == (xfs_daddr_t)0) {
325 /*
326 * Call the space management code to find the location of the
327 * inode on disk.
328 */
329 imap.im_blkno = bno;
330 error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
331 if (error != 0) {
332 return error;
333 }
334
335 /*
336 * If the inode number maps to a block outside the bounds
337 * of the file system then return an error rather than calling
338 * read_buf and panicking when we get an error from the
339 * driver.
340 */
341 if ((imap.im_blkno + imap.im_len) >
342 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
343#ifdef DEBUG
344 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
345 "(imap.im_blkno (0x%llx) "
346 "+ imap.im_len (0x%llx)) > "
347 " XFS_FSB_TO_BB(mp, "
348 "mp->m_sb.sb_dblocks) (0x%llx)",
349 (unsigned long long) imap.im_blkno,
350 (unsigned long long) imap.im_len,
351 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
352#endif /* DEBUG */
353 return XFS_ERROR(EINVAL);
354 }
355
356 /*
357 * Fill in the fields in the inode that will be used to
358 * map the inode to its buffer from now on.
359 */
360 ip->i_blkno = imap.im_blkno;
361 ip->i_len = imap.im_len;
362 ip->i_boffset = imap.im_boffset;
363 } else {
364 /*
365 * We've already mapped the inode once, so just use the
366 * mapping that we saved the first time.
367 */
368 imap.im_blkno = ip->i_blkno;
369 imap.im_len = ip->i_len;
370 imap.im_boffset = ip->i_boffset;
371 }
372 ASSERT(bno == 0 || bno == imap.im_blkno);
373
374 /*
375 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
376 * default to just a read_buf() call.
377 */
378 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
379 (int)imap.im_len, XFS_BUF_LOCK, &bp);
380
381 if (error) {
382#ifdef DEBUG
383 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
384 "xfs_trans_read_buf() returned error %d, "
385 "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
386 error, (unsigned long long) imap.im_blkno,
387 (unsigned long long) imap.im_len);
388#endif /* DEBUG */
389 return error;
390 }
391#ifdef __KERNEL__
392 /*
393 * Validate the magic number and version of every inode in the buffer
394 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
395 */
396#ifdef DEBUG
397 ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
398#else
399 ni = 1;
400#endif
401 for (i = 0; i < ni; i++) {
402 int di_ok;
403 xfs_dinode_t *dip;
404
405 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
406 (i << mp->m_sb.sb_inodelog));
407 di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
408 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
409 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
410 XFS_RANDOM_ITOBP_INOTOBP))) {
411#ifdef DEBUG
412 prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
413 mp->m_ddev_targp,
414 (unsigned long long)imap.im_blkno, i,
415 INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
416#endif
417 XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
418 mp, dip);
419 xfs_trans_brelse(tp, bp);
420 return XFS_ERROR(EFSCORRUPTED);
421 }
422 }
423#endif /* __KERNEL__ */
424
425 xfs_inobp_check(mp, bp);
426
427 /*
428 * Mark the buffer as an inode buffer now that it looks good
429 */
430 XFS_BUF_SET_VTYPE(bp, B_FS_INO);
431
432 /*
433 * Set *dipp to point to the on-disk inode in the buffer.
434 */
435 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
436 *bpp = bp;
437 return 0;
438}
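
By contrast with xfs_inotobp() above, this routine starts from an in-core inode and reuses the cached mapping on every call after the first. A hedged usage sketch:

    error = xfs_itobp(mp, tp, ip, &dip, &bp, 0);   /* bno 0: no block hint */
    if (error)
            return error;
    /* ... copy fields between ip->i_d and dip, log or write bp ... */
    xfs_trans_brelse(tp, bp);
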
439
440/*
441 * Move inode type and inode format specific information from the
442 * on-disk inode to the in-core inode. For fifos, devs, and sockets
443 * this means set if_rdev to the proper value. For files, directories,
444 * and symlinks this means to bring in the in-line data or extent
445 * pointers. For a file in B-tree format, only the root is immediately
446 * brought in-core. The rest will be in-lined in if_extents when it
447 * is first referenced (see xfs_iread_extents()).
448 */
449STATIC int
450xfs_iformat(
451 xfs_inode_t *ip,
452 xfs_dinode_t *dip)
453{
454 xfs_attr_shortform_t *atp;
455 int size;
456 int error;
457 xfs_fsize_t di_size;
458 ip->i_df.if_ext_max =
459 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
460 error = 0;
461
462 if (unlikely(
463 INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
464 INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
465 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) {
466 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
467 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu."
468 " Unmount and run xfs_repair.",
469 (unsigned long long)ip->i_ino,
470 (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
471 + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
472 (unsigned long long)
473 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
474 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
475 ip->i_mount, dip);
476 return XFS_ERROR(EFSCORRUPTED);
477 }
478
479 if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) {
480 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
481 "corrupt dinode %Lu, forkoff = 0x%x."
482 " Unmount and run xfs_repair.",
483 (unsigned long long)ip->i_ino,
484 (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
485 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
486 ip->i_mount, dip);
487 return XFS_ERROR(EFSCORRUPTED);
488 }
489
490 switch (ip->i_d.di_mode & S_IFMT) {
491 case S_IFIFO:
492 case S_IFCHR:
493 case S_IFBLK:
494 case S_IFSOCK:
495 if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) {
496 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
497 ip->i_mount, dip);
498 return XFS_ERROR(EFSCORRUPTED);
499 }
500 ip->i_d.di_size = 0;
501 ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
502 break;
503
504 case S_IFREG:
505 case S_IFLNK:
506 case S_IFDIR:
507 switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
508 case XFS_DINODE_FMT_LOCAL:
509 /*
510 * no local regular files yet
511 */
512 if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) {
513 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
514 "corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.",
515 (unsigned long long) ip->i_ino);
516 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
517 XFS_ERRLEVEL_LOW,
518 ip->i_mount, dip);
519 return XFS_ERROR(EFSCORRUPTED);
520 }
521
522 di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
523 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
524 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
525 "corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.",
526 (unsigned long long) ip->i_ino,
527 (long long) di_size);
528 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
529 XFS_ERRLEVEL_LOW,
530 ip->i_mount, dip);
531 return XFS_ERROR(EFSCORRUPTED);
532 }
533
534 size = (int)di_size;
535 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
536 break;
537 case XFS_DINODE_FMT_EXTENTS:
538 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
539 break;
540 case XFS_DINODE_FMT_BTREE:
541 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
542 break;
543 default:
544 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
545 ip->i_mount);
546 return XFS_ERROR(EFSCORRUPTED);
547 }
548 break;
549
550 default:
551 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
552 return XFS_ERROR(EFSCORRUPTED);
553 }
554 if (error) {
555 return error;
556 }
557 if (!XFS_DFORK_Q(dip))
558 return 0;
559 ASSERT(ip->i_afp == NULL);
560 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
561 ip->i_afp->if_ext_max =
562 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
563 switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
564 case XFS_DINODE_FMT_LOCAL:
565 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
566 size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
567 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
568 break;
569 case XFS_DINODE_FMT_EXTENTS:
570 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
571 break;
572 case XFS_DINODE_FMT_BTREE:
573 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
574 break;
575 default:
576 error = XFS_ERROR(EFSCORRUPTED);
577 break;
578 }
579 if (error) {
580 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
581 ip->i_afp = NULL;
582 xfs_idestroy_fork(ip, XFS_DATA_FORK);
583 }
584 return error;
585}
586
587/*
588 * The file is in-lined in the on-disk inode.
589 * If it fits into if_inline_data, then copy
590 * it there, otherwise allocate a buffer for it
591 * and copy the data there. Either way, set
592 * if_data to point at the data.
593 * If we allocate a buffer for the data, make
594 * sure that its size is a multiple of 4 and
595 * record the real size in if_real_bytes.
596 */
597STATIC int
598xfs_iformat_local(
599 xfs_inode_t *ip,
600 xfs_dinode_t *dip,
601 int whichfork,
602 int size)
603{
604 xfs_ifork_t *ifp;
605 int real_size;
606
607 /*
608 * If the size is unreasonable, then something
609 * is wrong and we just bail out rather than crash in
610 * kmem_alloc() or memcpy() below.
611 */
612 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
613 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
614 "corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.",
615 (unsigned long long) ip->i_ino, size,
616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
617 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
618 ip->i_mount, dip);
619 return XFS_ERROR(EFSCORRUPTED);
620 }
621 ifp = XFS_IFORK_PTR(ip, whichfork);
622 real_size = 0;
623 if (size == 0)
624 ifp->if_u1.if_data = NULL;
625 else if (size <= sizeof(ifp->if_u2.if_inline_data))
626 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
627 else {
628 real_size = roundup(size, 4);
629 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
630 }
631 ifp->if_bytes = size;
632 ifp->if_real_bytes = real_size;
633 if (size)
634 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
635 ifp->if_flags &= ~XFS_IFEXTENTS;
636 ifp->if_flags |= XFS_IFINLINE;
637 return 0;
638}
639
640/*
641 * The file consists of a set of extents all
642 * of which fit into the on-disk inode.
643 * If there are few enough extents to fit into
644 * the if_inline_ext, then copy them there.
645 * Otherwise allocate a buffer for them and copy
646 * them into it. Either way, set if_extents
647 * to point at the extents.
648 */
649STATIC int
650xfs_iformat_extents(
651 xfs_inode_t *ip,
652 xfs_dinode_t *dip,
653 int whichfork)
654{
655 xfs_bmbt_rec_t *ep, *dp;
656 xfs_ifork_t *ifp;
657 int nex;
658 int real_size;
659 int size;
660 int i;
661
662 ifp = XFS_IFORK_PTR(ip, whichfork);
663 nex = XFS_DFORK_NEXTENTS(dip, whichfork);
664 size = nex * (uint)sizeof(xfs_bmbt_rec_t);
665
666 /*
667 * If the number of extents is unreasonable, then something
668 * is wrong and we just bail out rather than crash in
669 * kmem_alloc() or memcpy() below.
670 */
671 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
672 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
673 "corrupt inode %Lu ((a)extents = %d). Unmount and run xfs_repair.",
674 (unsigned long long) ip->i_ino, nex);
675 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
676 ip->i_mount, dip);
677 return XFS_ERROR(EFSCORRUPTED);
678 }
679
680 real_size = 0;
681 if (nex == 0)
682 ifp->if_u1.if_extents = NULL;
683 else if (nex <= XFS_INLINE_EXTS)
684 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
685 else {
686 ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
687 ASSERT(ifp->if_u1.if_extents != NULL);
688 real_size = size;
689 }
690 ifp->if_bytes = size;
691 ifp->if_real_bytes = real_size;
692 if (size) {
693 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
694 xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip));
695 ep = ifp->if_u1.if_extents;
696 for (i = 0; i < nex; i++, ep++, dp++) {
697 ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0),
698 ARCH_CONVERT);
699 ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1),
700 ARCH_CONVERT);
701 }
702 xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
703 whichfork);
704 if (whichfork != XFS_DATA_FORK ||
705 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
706 if (unlikely(xfs_check_nostate_extents(
707 ifp->if_u1.if_extents, nex))) {
708 XFS_ERROR_REPORT("xfs_iformat_extents(2)",
709 XFS_ERRLEVEL_LOW,
710 ip->i_mount);
711 return XFS_ERROR(EFSCORRUPTED);
712 }
713 }
714 ifp->if_flags |= XFS_IFEXTENTS;
715 return 0;
716}
717
718/*
719 * The file has too many extents to fit into
720 * the inode, so they are in B-tree format.
721 * Allocate a buffer for the root of the B-tree
722 * and copy the root into it. The i_extents
723 * field will remain NULL until all of the
724 * extents are read in (when they are needed).
725 */
726STATIC int
727xfs_iformat_btree(
728 xfs_inode_t *ip,
729 xfs_dinode_t *dip,
730 int whichfork)
731{
732 xfs_bmdr_block_t *dfp;
733 xfs_ifork_t *ifp;
734 /* REFERENCED */
735 int nrecs;
736 int size;
737
738 ifp = XFS_IFORK_PTR(ip, whichfork);
739 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
740 size = XFS_BMAP_BROOT_SPACE(dfp);
741 nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
742
743 /*
744 * blow out if -- fork has fewer extents than can fit in
745 * fork (fork shouldn't be a btree format), root btree
746 * block has more records than can fit into the fork,
747 * or the number of extents is greater than the number of
748 * blocks.
749 */
750 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
751 || XFS_BMDR_SPACE_CALC(nrecs) >
752 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
753 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
754 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
755 "corrupt inode %Lu (btree). Unmount and run xfs_repair.",
756 (unsigned long long) ip->i_ino);
757 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
758 ip->i_mount);
759 return XFS_ERROR(EFSCORRUPTED);
760 }
761
762 ifp->if_broot_bytes = size;
763 ifp->if_broot = kmem_alloc(size, KM_SLEEP);
764 ASSERT(ifp->if_broot != NULL);
765 /*
766 * Copy and convert from the on-disk structure
767 * to the in-memory structure.
768 */
769 xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
770 ifp->if_broot, size);
771 ifp->if_flags &= ~XFS_IFEXTENTS;
772 ifp->if_flags |= XFS_IFBROOT;
773
774 return 0;
775}
776
777/*
778 * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk
779 * and native format
780 *
781 * buf = on-disk representation
782 * dip = native representation
783 * dir = direction - +ve -> disk to native
784 * -ve -> native to disk
785 */
786void
787xfs_xlate_dinode_core(
788 xfs_caddr_t buf,
789 xfs_dinode_core_t *dip,
790 int dir)
791{
792 xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf;
793 xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip;
794 xfs_arch_t arch = ARCH_CONVERT;
795
796 ASSERT(dir);
797
798 INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch);
799 INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch);
800 INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch);
801 INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch);
802 INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch);
803 INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch);
804 INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch);
805 INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch);
806 INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch);
807
808 if (dir > 0) {
809 memcpy(mem_core->di_pad, buf_core->di_pad,
810 sizeof(buf_core->di_pad));
811 } else {
812 memcpy(buf_core->di_pad, mem_core->di_pad,
813 sizeof(buf_core->di_pad));
814 }
815
816 INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch);
817
818 INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,
819 dir, arch);
820 INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec,
821 dir, arch);
822 INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,
823 dir, arch);
824 INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec,
825 dir, arch);
826 INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,
827 dir, arch);
828 INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec,
829 dir, arch);
830 INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch);
831 INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch);
832 INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch);
833 INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch);
834 INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch);
835 INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch);
836 INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch);
837 INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch);
838 INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch);
839 INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch);
840 INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch);
841}
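
The sign convention works out to one call per direction; the first form is exactly what xfs_iread() below does, and the second is the inverse used when flushing the inode back to its buffer:

    /* disk -> native (dir > 0) */
    xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, &ip->i_d, 1);

    /* native -> disk (dir < 0) */
    xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, &ip->i_d, -1);
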
842
843STATIC uint
844_xfs_dic2xflags(
845 xfs_dinode_core_t *dic,
846 __uint16_t di_flags)
847{
848 uint flags = 0;
849
850 if (di_flags & XFS_DIFLAG_ANY) {
851 if (di_flags & XFS_DIFLAG_REALTIME)
852 flags |= XFS_XFLAG_REALTIME;
853 if (di_flags & XFS_DIFLAG_PREALLOC)
854 flags |= XFS_XFLAG_PREALLOC;
855 if (di_flags & XFS_DIFLAG_IMMUTABLE)
856 flags |= XFS_XFLAG_IMMUTABLE;
857 if (di_flags & XFS_DIFLAG_APPEND)
858 flags |= XFS_XFLAG_APPEND;
859 if (di_flags & XFS_DIFLAG_SYNC)
860 flags |= XFS_XFLAG_SYNC;
861 if (di_flags & XFS_DIFLAG_NOATIME)
862 flags |= XFS_XFLAG_NOATIME;
863 if (di_flags & XFS_DIFLAG_NODUMP)
864 flags |= XFS_XFLAG_NODUMP;
865 if (di_flags & XFS_DIFLAG_RTINHERIT)
866 flags |= XFS_XFLAG_RTINHERIT;
867 if (di_flags & XFS_DIFLAG_PROJINHERIT)
868 flags |= XFS_XFLAG_PROJINHERIT;
869 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
870 flags |= XFS_XFLAG_NOSYMLINKS;
871 }
872
873 return flags;
874}
875
876uint
877xfs_ip2xflags(
878 xfs_inode_t *ip)
879{
880 xfs_dinode_core_t *dic = &ip->i_d;
881
882 return _xfs_dic2xflags(dic, dic->di_flags) |
883 (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
884}
885
886uint
887xfs_dic2xflags(
888 xfs_dinode_core_t *dic)
889{
890 return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
891 (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
892}
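
As a sketch, callers test the translated XFS_XFLAG_* bits rather than poking at di_flags directly; the policy shown is hypothetical:

    uint xflags = xfs_ip2xflags(ip);

    if (xflags & XFS_XFLAG_IMMUTABLE)
            return XFS_ERROR(EPERM);    /* refuse to modify an immutable file */
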
893
894/*
895 * Given a mount structure and an inode number, return a pointer
896 * to a newly allocated in-core inode corresponding to the given
897 * inode number.
898 *
899 * Initialize the inode's attributes and extent pointers if it
900 * already has them (it will not if the inode has no links).
901 */
902int
903xfs_iread(
904 xfs_mount_t *mp,
905 xfs_trans_t *tp,
906 xfs_ino_t ino,
907 xfs_inode_t **ipp,
908 xfs_daddr_t bno)
909{
910 xfs_buf_t *bp;
911 xfs_dinode_t *dip;
912 xfs_inode_t *ip;
913 int error;
914
915 ASSERT(xfs_inode_zone != NULL);
916
917 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
918 ip->i_ino = ino;
919 ip->i_mount = mp;
920
921 /*
922 * Get pointers to the on-disk inode and the buffer containing it.
923 * If the inode number refers to a block outside the file system
924 * then xfs_itobp() will return an error. In this case we should
925 * return the error as well. Set i_blkno to 0 so that xfs_itobp()
926 * will know that this is a new incore inode.
927 */
928 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);
929
930 if (error != 0) {
931 kmem_zone_free(xfs_inode_zone, ip);
932 return error;
933 }
934
935 /*
936 * Initialize inode's trace buffers.
937 * Do this before xfs_iformat in case it adds entries.
938 */
939#ifdef XFS_BMAP_TRACE
940 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
941#endif
942#ifdef XFS_BMBT_TRACE
943 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
944#endif
945#ifdef XFS_RW_TRACE
946 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
947#endif
948#ifdef XFS_ILOCK_TRACE
949 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
950#endif
951#ifdef XFS_DIR2_TRACE
952 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
953#endif
954
955 /*
956 * If we got something that isn't an inode it means someone
957 * (nfs or dmi) has a stale handle.
958 */
959 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
960 kmem_zone_free(xfs_inode_zone, ip);
961 xfs_trans_brelse(tp, bp);
962#ifdef DEBUG
963 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
964 "dip->di_core.di_magic (0x%x) != "
965 "XFS_DINODE_MAGIC (0x%x)",
966 INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
967 XFS_DINODE_MAGIC);
968#endif /* DEBUG */
969 return XFS_ERROR(EINVAL);
970 }
971
972 /*
973 * If the on-disk inode is already linked to a directory
974 * entry, copy all of the inode into the in-core inode.
975 * xfs_iformat() handles copying in the inode format
976 * specific information.
977 * Otherwise, just get the truly permanent information.
978 */
979 if (dip->di_core.di_mode) {
980 xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
981 &(ip->i_d), 1);
982 error = xfs_iformat(ip, dip);
983 if (error) {
984 kmem_zone_free(xfs_inode_zone, ip);
985 xfs_trans_brelse(tp, bp);
986#ifdef DEBUG
987 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
988 "xfs_iformat() returned error %d",
989 error);
990#endif /* DEBUG */
991 return error;
992 }
993 } else {
994 ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
995 ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
996 ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
997 ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT);
998 /*
999 * Make sure to pull in the mode here as well in
1000 * case the inode is released without being used.
1001 * This ensures that xfs_inactive() will see that
1002 * the inode is already free and not try to mess
1003 * with the uninitialized part of it.
1004 */
1005 ip->i_d.di_mode = 0;
1006 /*
1007 * Initialize the per-fork minima and maxima for a new
1008 * inode here. xfs_iformat will do it for old inodes.
1009 */
1010 ip->i_df.if_ext_max =
1011 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
1012 }
1013
1014 INIT_LIST_HEAD(&ip->i_reclaim);
1015
1016 /*
1017 * The inode format changed when we moved the link count and
1018 * made it 32 bits long. If this is an old format inode,
1019 * convert it in memory to look like a new one. If it gets
1020 * flushed to disk we will convert back before flushing or
1021 * logging it. We zero out the new projid field and the old link
1022 * count field. We'll handle clearing the pad field (the remains
1023 * of the old uuid field) when we actually convert the inode to
1024 * the new format. We don't change the version number so that we
1025 * can distinguish this from a real new format inode.
1026 */
1027 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
1028 ip->i_d.di_nlink = ip->i_d.di_onlink;
1029 ip->i_d.di_onlink = 0;
1030 ip->i_d.di_projid = 0;
1031 }
1032
1033 ip->i_delayed_blks = 0;
1034
1035 /*
1036 * Mark the buffer containing the inode as something to keep
1037 * around for a while. This helps to keep recently accessed
1038 * meta-data in-core longer.
1039 */
1040 XFS_BUF_SET_REF(bp, XFS_INO_REF);
1041
1042 /*
1043 * Use xfs_trans_brelse() to release the buffer containing the
1044 * on-disk inode, because it was acquired with xfs_trans_read_buf()
1045 * in xfs_itobp() above. If tp is NULL, this is just a normal
1046 * brelse(). If we're within a transaction, then xfs_trans_brelse()
1047 * will only release the buffer if it is not dirty within the
1048 * transaction. It will be OK to release the buffer in this case,
1049 * because inodes on disk are never destroyed and we will be
1050 * locking the new in-core inode before putting it in the hash
1051 * table where other processes can find it. Thus we don't have
1052 * to worry about the inode being changed just because we released
1053 * the buffer.
1054 */
1055 xfs_trans_brelse(tp, bp);
1056 *ipp = ip;
1057 return 0;
1058}
1059
1060/*
1061 * Read in extents from a btree-format inode.
1062 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
1063 */
1064int
1065xfs_iread_extents(
1066 xfs_trans_t *tp,
1067 xfs_inode_t *ip,
1068 int whichfork)
1069{
1070 int error;
1071 xfs_ifork_t *ifp;
1072 size_t size;
1073
1074 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
1075 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
1076 ip->i_mount);
1077 return XFS_ERROR(EFSCORRUPTED);
1078 }
1079 size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
1080 ifp = XFS_IFORK_PTR(ip, whichfork);
1081 /*
1082 * We know that the size is valid (it's checked in iformat_btree)
1083 */
1084 ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
1085 ASSERT(ifp->if_u1.if_extents != NULL);
1086 ifp->if_lastex = NULLEXTNUM;
1087 ifp->if_bytes = ifp->if_real_bytes = (int)size;
1088 ifp->if_flags |= XFS_IFEXTENTS;
1089 error = xfs_bmap_read_extents(tp, ip, whichfork);
1090 if (error) {
1091 kmem_free(ifp->if_u1.if_extents, size);
1092 ifp->if_u1.if_extents = NULL;
1093 ifp->if_bytes = ifp->if_real_bytes = 0;
1094 ifp->if_flags &= ~XFS_IFEXTENTS;
1095 return error;
1096 }
1097 xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents,
1098 XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip));
1099 return 0;
1100}
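
The canonical caller pattern reads the extents in lazily, only for btree-format forks whose extents are not yet in core (this is the idiom the bmap code uses):

    if (!(ifp->if_flags & XFS_IFEXTENTS) &&
        (error = xfs_iread_extents(tp, ip, whichfork)))
            return error;
    /* ifp->if_u1.if_extents is now valid */
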
1101
1102/*
1103 * Allocate an inode on disk and return a copy of its in-core version.
1104 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
1105 * appropriately within the inode. The uid and gid for the inode are
1106 * set according to the contents of the given cred structure.
1107 *
1108 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
1109 * has a free inode available, call xfs_iget()
1110 * to obtain the in-core version of the allocated inode. Finally,
1111 * fill in the inode and log its initial contents. In this case,
1112 * ialloc_context would be set to NULL and call_again set to false.
1113 *
1114 * If xfs_dialloc() does not have an available inode,
1115 * it will replenish its supply by doing an allocation. Since we can
1116 * only do one allocation within a transaction without deadlocks, we
1117 * must commit the current transaction before returning the inode itself.
1118 * In this case, therefore, we will set call_again to true and return.
1119 * The caller should then commit the current transaction, start a new
1120 * transaction, and call xfs_ialloc() again to actually get the inode.
1121 *
1122 * To ensure that some other process does not grab the inode that
1123 * was allocated during the first call to xfs_ialloc(), this routine
1124 * also returns the [locked] bp pointing to the head of the freelist
1125 * as ialloc_context. The caller should hold this buffer across
1126 * the commit and pass it back into this routine on the second call.
1127 */
1128int
1129xfs_ialloc(
1130 xfs_trans_t *tp,
1131 xfs_inode_t *pip,
1132 mode_t mode,
1133 nlink_t nlink,
1134 xfs_dev_t rdev,
1135 cred_t *cr,
1136 xfs_prid_t prid,
1137 int okalloc,
1138 xfs_buf_t **ialloc_context,
1139 boolean_t *call_again,
1140 xfs_inode_t **ipp)
1141{
1142 xfs_ino_t ino;
1143 xfs_inode_t *ip;
1144 vnode_t *vp;
1145 uint flags;
1146 int error;
1147
1148 /*
1149 * Call the space management code to pick
1150 * the on-disk inode to be allocated.
1151 */
1152 error = xfs_dialloc(tp, pip->i_ino, mode, okalloc,
1153 ialloc_context, call_again, &ino);
1154 if (error != 0) {
1155 return error;
1156 }
1157 if (*call_again || ino == NULLFSINO) {
1158 *ipp = NULL;
1159 return 0;
1160 }
1161 ASSERT(*ialloc_context == NULL);
1162
1163 /*
1164 * Get the in-core inode with the lock held exclusively.
1165 * This is because we're setting fields here that we need
1166 * to prevent others from looking at until we're done.
1167 */
1168 error = xfs_trans_iget(tp->t_mountp, tp, ino,
1169 IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1170 if (error != 0) {
1171 return error;
1172 }
1173 ASSERT(ip != NULL);
1174
1175 vp = XFS_ITOV(ip);
1176 vp->v_type = IFTOVT(mode);
1177 ip->i_d.di_mode = (__uint16_t)mode;
1178 ip->i_d.di_onlink = 0;
1179 ip->i_d.di_nlink = nlink;
1180 ASSERT(ip->i_d.di_nlink == nlink);
1181 ip->i_d.di_uid = current_fsuid(cr);
1182 ip->i_d.di_gid = current_fsgid(cr);
1183 ip->i_d.di_projid = prid;
1184 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1185
1186 /*
1187 * If the superblock version is up to where we support new format
1188 * inodes and this is currently an old format inode, then change
1189 * the inode version number now. This way we only do the conversion
1190 * here rather than here and in the flush/logging code.
1191 */
1192 if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
1193 ip->i_d.di_version == XFS_DINODE_VERSION_1) {
1194 ip->i_d.di_version = XFS_DINODE_VERSION_2;
1195 /*
1196 * We've already zeroed the old link count, the projid field,
1197 * and the pad field.
1198 */
1199 }
1200
1201 /*
1202 * Project ids won't be stored on disk if we are using a version 1 inode.
1203 */
1204 if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
1205 xfs_bump_ino_vers2(tp, ip);
1206
1207 if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
1208 ip->i_d.di_gid = pip->i_d.di_gid;
1209 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
1210 ip->i_d.di_mode |= S_ISGID;
1211 }
1212 }
1213
1214 /*
1215 * If the group ID of the new file does not match the effective group
1216 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared,
1217 * but only if the irix_sgid_inherit compatibility variable is set.
1218 */
1219 if ((irix_sgid_inherit) &&
1220 (ip->i_d.di_mode & S_ISGID) &&
1221 (!in_group_p((gid_t)ip->i_d.di_gid))) {
1222 ip->i_d.di_mode &= ~S_ISGID;
1223 }
1224
1225 ip->i_d.di_size = 0;
1226 ip->i_d.di_nextents = 0;
1227 ASSERT(ip->i_d.di_nblocks == 0);
1228 xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
1229 /*
1230 * di_gen will have been taken care of in xfs_iread.
1231 */
1232 ip->i_d.di_extsize = 0;
1233 ip->i_d.di_dmevmask = 0;
1234 ip->i_d.di_dmstate = 0;
1235 ip->i_d.di_flags = 0;
1236 flags = XFS_ILOG_CORE;
1237 switch (mode & S_IFMT) {
1238 case S_IFIFO:
1239 case S_IFCHR:
1240 case S_IFBLK:
1241 case S_IFSOCK:
1242 ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1243 ip->i_df.if_u2.if_rdev = rdev;
1244 ip->i_df.if_flags = 0;
1245 flags |= XFS_ILOG_DEV;
1246 break;
1247 case S_IFREG:
1248 case S_IFDIR:
1249 if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1250 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
1251 if ((mode & S_IFMT) == S_IFDIR) {
1252 ip->i_d.di_flags |= XFS_DIFLAG_RTINHERIT;
1253 } else {
1254 ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
1255 ip->i_iocore.io_flags |= XFS_IOCORE_RT;
1256 }
1257 }
1258 if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1259 xfs_inherit_noatime)
1260 ip->i_d.di_flags |= XFS_DIFLAG_NOATIME;
1261 if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1262 xfs_inherit_nodump)
1263 ip->i_d.di_flags |= XFS_DIFLAG_NODUMP;
1264 if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1265 xfs_inherit_sync)
1266 ip->i_d.di_flags |= XFS_DIFLAG_SYNC;
1267 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
1268 xfs_inherit_nosymlinks)
1269 ip->i_d.di_flags |= XFS_DIFLAG_NOSYMLINKS;
1270 }
1271 /* FALLTHROUGH */
1272 case S_IFLNK:
1273 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1274 ip->i_df.if_flags = XFS_IFEXTENTS;
1275 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1276 ip->i_df.if_u1.if_extents = NULL;
1277 break;
1278 default:
1279 ASSERT(0);
1280 }
1281 /*
1282 * Attribute fork settings for new inode.
1283 */
1284 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1285 ip->i_d.di_anextents = 0;
1286
1287 /*
1288 * Log the new values stuffed into the inode.
1289 */
1290 xfs_trans_log_inode(tp, ip, flags);
1291
1292 /* now that we have a v_type we can set Linux inode ops (& unlock) */
1293 VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
1294
1295 *ipp = ip;
1296 return 0;
1297}
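
A hedged sketch of the two-phase protocol described above, with the transaction commit/reserve step elided (xfs_dir_ialloc() in xfs_utils.c wraps a loop of this shape):

    xfs_buf_t   *ialloc_context = NULL;
    boolean_t   call_again = B_FALSE;
    xfs_inode_t *ip;

    error = xfs_ialloc(tp, dp, mode, nlink, rdev, cr, prid, okalloc,
                       &ialloc_context, &call_again, &ip);
    if (!error && call_again) {
            xfs_trans_bhold(tp, ialloc_context);  /* keep freelist bp locked */
            /* ... commit tp, start and reserve a new tp ... */
            error = xfs_ialloc(tp, dp, mode, nlink, rdev, cr, prid, okalloc,
                               &ialloc_context, &call_again, &ip);
    }
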
1298
1299/*
1300 * Check to make sure that there are no blocks allocated to the
1301 * file beyond the size of the file. We don't check this for
1302 * files with fixed size extents or real time extents, but we
1303 * at least do it for regular files.
1304 */
1305#ifdef DEBUG
1306void
1307xfs_isize_check(
1308 xfs_mount_t *mp,
1309 xfs_inode_t *ip,
1310 xfs_fsize_t isize)
1311{
1312 xfs_fileoff_t map_first;
1313 int nimaps;
1314 xfs_bmbt_irec_t imaps[2];
1315
1316 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
1317 return;
1318
1319	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1320 return;
1321
1322 nimaps = 2;
1323 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
1324 /*
1325 * The filesystem could be shutting down, so bmapi may return
1326 * an error.
1327 */
1328 if (xfs_bmapi(NULL, ip, map_first,
1329 (XFS_B_TO_FSB(mp,
1330 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
1331 map_first),
1332 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
1333 NULL))
1334 return;
1335 ASSERT(nimaps == 1);
1336 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1337}
1338#endif /* DEBUG */
1339
1340/*
1341 * Calculate the last possible buffered byte in a file. This must
1342 * include data that was buffered beyond the EOF by the write code.
1343 * This also needs to deal with overflowing the xfs_fsize_t type
1344 * which can happen for sizes near the limit.
1345 *
1346 * We also need to take into account any blocks beyond the EOF. It
1347 * may be the case that they were buffered by a write which failed.
1348 * In that case the pages will still be in memory, but the inode size
1349 * will never have been updated.
1350 */
1351xfs_fsize_t
1352xfs_file_last_byte(
1353 xfs_inode_t *ip)
1354{
1355 xfs_mount_t *mp;
1356 xfs_fsize_t last_byte;
1357 xfs_fileoff_t last_block;
1358 xfs_fileoff_t size_last_block;
1359 int error;
1360
1361 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));
1362
1363 mp = ip->i_mount;
1364 /*
1365 * Only check for blocks beyond the EOF if the extents have
1366 * been read in. This eliminates the need for the inode lock,
1367 * and it also saves us from looking when it really isn't
1368 * necessary.
1369 */
1370 if (ip->i_df.if_flags & XFS_IFEXTENTS) {
1371 error = xfs_bmap_last_offset(NULL, ip, &last_block,
1372 XFS_DATA_FORK);
1373 if (error) {
1374 last_block = 0;
1375 }
1376 } else {
1377 last_block = 0;
1378 }
1379 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
1380 last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1381
1382 last_byte = XFS_FSB_TO_B(mp, last_block);
1383 if (last_byte < 0) {
1384 return XFS_MAXIOFFSET(mp);
1385 }
1386 last_byte += (1 << mp->m_writeio_log);
1387 if (last_byte < 0) {
1388 return XFS_MAXIOFFSET(mp);
1389 }
1390 return last_byte;
1391}
1392
1393#if defined(XFS_RW_TRACE)
1394STATIC void
1395xfs_itrunc_trace(
1396 int tag,
1397 xfs_inode_t *ip,
1398 int flag,
1399 xfs_fsize_t new_size,
1400 xfs_off_t toss_start,
1401 xfs_off_t toss_finish)
1402{
1403 if (ip->i_rwtrace == NULL) {
1404 return;
1405 }
1406
1407 ktrace_enter(ip->i_rwtrace,
1408 (void*)((long)tag),
1409 (void*)ip,
1410 (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
1411 (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
1412 (void*)((long)flag),
1413 (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
1414 (void*)(unsigned long)(new_size & 0xffffffff),
1415 (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
1416 (void*)(unsigned long)(toss_start & 0xffffffff),
1417 (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
1418 (void*)(unsigned long)(toss_finish & 0xffffffff),
1419 (void*)(unsigned long)current_cpu(),
1420 (void*)0,
1421 (void*)0,
1422 (void*)0,
1423 (void*)0);
1424}
1425#else
1426#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
1427#endif
1428
1429/*
1430 * Start the truncation of the file to new_size. The new size
1431 * must be smaller than the current size. This routine will
1432 * clear the buffer and page caches of file data in the removed
1433 * range, and xfs_itruncate_finish() will remove the underlying
1434 * disk blocks.
1435 *
1436 * The inode must have its I/O lock locked EXCLUSIVELY, and it
1437 * must NOT have the inode lock held at all. This is because we're
1438 * calling into the buffer/page cache code and we can't hold the
1439 * inode lock when we do so.
1440 *
1441 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1442 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
1443 * in the case that the caller is locking things out of order and
1444 * may not be able to call xfs_itruncate_finish() with the inode lock
1445 * held without dropping the I/O lock. If the caller must drop the
1446 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
1447 * must be called again with all the same restrictions as the initial
1448 * call.
1449 */
1450void
1451xfs_itruncate_start(
1452 xfs_inode_t *ip,
1453 uint flags,
1454 xfs_fsize_t new_size)
1455{
1456 xfs_fsize_t last_byte;
1457 xfs_off_t toss_start;
1458 xfs_mount_t *mp;
1459 vnode_t *vp;
1460
1461 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1462 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1463 ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
1464 (flags == XFS_ITRUNC_MAYBE));
1465
1466 mp = ip->i_mount;
1467 vp = XFS_ITOV(ip);
1468 /*
1469	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of
1470	 * pages and buffers overlapping the region being removed.  We
1471	 * have to use the less efficient VOP_FLUSHINVAL_PAGES() in the
1472	 * case that the caller may not be able to finish the truncate
1473	 * without dropping the inode's I/O lock.  Make sure
1474 * to catch any pages brought in by buffers overlapping
1475 * the EOF by searching out beyond the isize by our
1476 * block size. We round new_size up to a block boundary
1477 * so that we don't toss things on the same block as
1478 * new_size but before it.
1479 *
1480 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
1481 * call remapf() over the same region if the file is mapped.
1482 * This frees up mapped file references to the pages in the
1483 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
1484 * that we get the latest mapped changes flushed out.
1485 */
1486 toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1487 toss_start = XFS_FSB_TO_B(mp, toss_start);
1488 if (toss_start < 0) {
1489 /*
1490 * The place to start tossing is beyond our maximum
1491 * file size, so there is no way that the data extended
1492 * out there.
1493 */
1494 return;
1495 }
1496 last_byte = xfs_file_last_byte(ip);
1497 xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
1498 last_byte);
1499 if (last_byte > toss_start) {
1500 if (flags & XFS_ITRUNC_DEFINITE) {
1501 VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1502 } else {
1503 VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1504 }
1505 }
1506
1507#ifdef DEBUG
1508 if (new_size == 0) {
1509 ASSERT(VN_CACHED(vp) == 0);
1510 }
1511#endif
1512}
1513
1514/*
1515 * Shrink the file to the given new_size. The new
1516 * size must be smaller than the current size.
1517 * This will free up the underlying blocks
1518 * in the removed range after a call to xfs_itruncate_start()
1519 * or xfs_atruncate_start().
1520 *
1521 * The transaction passed to this routine must have made
1522 * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
1523 * This routine may commit the given transaction and
1524 * start new ones, so make sure everything involved in
1525 * the transaction is tidy before calling here.
1526 * Some transaction will be returned to the caller to be
1527 * committed. The incoming transaction must already include
1528 * the inode, and both inode locks must be held exclusively.
1529 * The inode must also be "held" within the transaction. On
1530 * return the inode will be "held" within the returned transaction.
1531 * This routine does NOT require any disk space to be reserved
1532 * for it within the transaction.
1533 *
1534	 * The fork parameter must be either XFS_ATTR_FORK or XFS_DATA_FORK,
1535 * and it indicates the fork which is to be truncated. For the
1536 * attribute fork we only support truncation to size 0.
1537 *
1538 * We use the sync parameter to indicate whether or not the first
1539 * transaction we perform might have to be synchronous. For the attr fork,
1540 * it needs to be so if the unlink of the inode is not yet known to be
1541 * permanent in the log. This keeps us from freeing and reusing the
1542 * blocks of the attribute fork before the unlink of the inode becomes
1543 * permanent.
1544 *
1545 * For the data fork, we normally have to run synchronously if we're
1546 * being called out of the inactive path or we're being called
1547 * out of the create path where we're truncating an existing file.
1548 * Either way, the truncate needs to be sync so blocks don't reappear
1549 * in the file with altered data in case of a crash. wsync filesystems
1550 * can run the first case async because anything that shrinks the inode
1551 * has to run sync so by the time we're called here from inactive, the
1552 * inode size is permanently set to 0.
1553 *
1554 * Calls from the truncate path always need to be sync unless we're
1555 * in a wsync filesystem and the file has already been unlinked.
1556 *
1557 * The caller is responsible for correctly setting the sync parameter.
1558 * It gets too hard for us to guess here which path we're being called
1559 * out of just based on inode state.
1560 */
1561int
1562xfs_itruncate_finish(
1563 xfs_trans_t **tp,
1564 xfs_inode_t *ip,
1565 xfs_fsize_t new_size,
1566 int fork,
1567 int sync)
1568{
1569 xfs_fsblock_t first_block;
1570 xfs_fileoff_t first_unmap_block;
1571 xfs_fileoff_t last_block;
1572	xfs_filblks_t	unmap_len = 0;
1573 xfs_mount_t *mp;
1574 xfs_trans_t *ntp;
1575 int done;
1576 int committed;
1577 xfs_bmap_free_t free_list;
1578 int error;
1579
1580 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1581 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
1582 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1583 ASSERT(*tp != NULL);
1584 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1585 ASSERT(ip->i_transp == *tp);
1586 ASSERT(ip->i_itemp != NULL);
1587 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
1588
1589
1590 ntp = *tp;
1591	mp = ntp->t_mountp;
1592	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1593
1594 /*
1595 * We only support truncating the entire attribute fork.
1596 */
1597 if (fork == XFS_ATTR_FORK) {
1598 new_size = 0LL;
1599 }
1600 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1601 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
1602 /*
1603 * The first thing we do is set the size to new_size permanently
1604 * on disk. This way we don't have to worry about anyone ever
1605 * being able to look at the data being freed even in the face
1606 * of a crash. What we're getting around here is the case where
1607 * we free a block, it is allocated to another file, it is written
1608 * to, and then we crash. If the new data gets written to the
1609 * file but the log buffers containing the free and reallocation
1610 * don't, then we'd end up with garbage in the blocks being freed.
1611 * As long as we make the new_size permanent before actually
1612	 * freeing any blocks it doesn't matter if they get written to.
1613 *
1614 * The callers must signal into us whether or not the size
1615 * setting here must be synchronous. There are a few cases
1616 * where it doesn't have to be synchronous. Those cases
1617 * occur if the file is unlinked and we know the unlink is
1618 * permanent or if the blocks being truncated are guaranteed
1619 * to be beyond the inode eof (regardless of the link count)
1620 * and the eof value is permanent. Both of these cases occur
1621 * only on wsync-mounted filesystems. In those cases, we're
1622 * guaranteed that no user will ever see the data in the blocks
1623 * that are being truncated so the truncate can run async.
1624 * In the free beyond eof case, the file may wind up with
1625 * more blocks allocated to it than it needs if we crash
1626 * and that won't get fixed until the next time the file
1627 * is re-opened and closed but that's ok as that shouldn't
1628 * be too many blocks.
1629 *
1630 * However, we can't just make all wsync xactions run async
1631 * because there's one call out of the create path that needs
1632 * to run sync where it's truncating an existing file to size
1633 * 0 whose size is > 0.
1634 *
1635 * It's probably possible to come up with a test in this
1636 * routine that would correctly distinguish all the above
1637 * cases from the values of the function parameters and the
1638 * inode state but for sanity's sake, I've decided to let the
1639 * layers above just tell us. It's simpler to correctly figure
1640 * out in the layer above exactly under what conditions we
1641	 * can run async, and I think it's easier for others to read and
1642 * follow the logic in case something has to be changed.
1643 * cscope is your friend -- rcc.
1644 *
1645 * The attribute fork is much simpler.
1646 *
1647 * For the attribute fork we allow the caller to tell us whether
1648 * the unlink of the inode that led to this call is yet permanent
1649 * in the on disk log. If it is not and we will be freeing extents
1650 * in this inode then we make the first transaction synchronous
1651 * to make sure that the unlink is permanent by the time we free
1652 * the blocks.
1653 */
1654 if (fork == XFS_DATA_FORK) {
1655 if (ip->i_d.di_nextents > 0) {
1656 ip->i_d.di_size = new_size;
1657 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1658 }
1659 } else if (sync) {
1660 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1661 if (ip->i_d.di_anextents > 0)
1662 xfs_trans_set_sync(ntp);
1663 }
1664 ASSERT(fork == XFS_DATA_FORK ||
1665 (fork == XFS_ATTR_FORK &&
1666 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1667 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1668
1669 /*
1670 * Since it is possible for space to become allocated beyond
1671 * the end of the file (in a crash where the space is allocated
1672 * but the inode size is not yet updated), simply remove any
1673 * blocks which show up between the new EOF and the maximum
1674 * possible file size. If the first block to be removed is
1675 * beyond the maximum file size (ie it is the same as last_block),
1676 * then there is nothing to do.
1677 */
1678 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1679 ASSERT(first_unmap_block <= last_block);
1680 done = 0;
1681 if (last_block == first_unmap_block) {
1682 done = 1;
1683 } else {
1684 unmap_len = last_block - first_unmap_block + 1;
1685 }
1686 while (!done) {
1687 /*
1688		 * Free up to XFS_ITRUNC_MAX_EXTENTS extents.  xfs_bunmapi()
1689 * will tell us whether it freed the entire range or
1690 * not. If this is a synchronous mount (wsync),
1691 * then we can tell bunmapi to keep all the
1692 * transactions asynchronous since the unlink
1693 * transaction that made this inode inactive has
1694 * already hit the disk. There's no danger of
1695 * the freed blocks being reused, there being a
1696 * crash, and the reused blocks suddenly reappearing
1697 * in this file with garbage in them once recovery
1698 * runs.
1699 */
1700 XFS_BMAP_INIT(&free_list, &first_block);
1701 error = xfs_bunmapi(ntp, ip, first_unmap_block,
1702 unmap_len,
1703 XFS_BMAPI_AFLAG(fork) |
1704 (sync ? 0 : XFS_BMAPI_ASYNC),
1705 XFS_ITRUNC_MAX_EXTENTS,
1706 &first_block, &free_list, &done);
1707 if (error) {
1708 /*
1709 * If the bunmapi call encounters an error,
1710 * return to the caller where the transaction
1711 * can be properly aborted. We just need to
1712 * make sure we're not holding any resources
1713 * that we were not when we came in.
1714 */
1715 xfs_bmap_cancel(&free_list);
1716 return error;
1717 }
1718
1719 /*
1720 * Duplicate the transaction that has the permanent
1721 * reservation and commit the old transaction.
1722 */
1723 error = xfs_bmap_finish(tp, &free_list, first_block,
1724 &committed);
1725 ntp = *tp;
1726 if (error) {
1727 /*
1728 * If the bmap finish call encounters an error,
1729 * return to the caller where the transaction
1730 * can be properly aborted. We just need to
1731 * make sure we're not holding any resources
1732 * that we were not when we came in.
1733 *
1734 * Aborting from this point might lose some
1735 * blocks in the file system, but oh well.
1736 */
1737 xfs_bmap_cancel(&free_list);
1738 if (committed) {
1739 /*
1740 * If the passed in transaction committed
1741 * in xfs_bmap_finish(), then we want to
1742 * add the inode to this one before returning.
1743 * This keeps things simple for the higher
1744 * level code, because it always knows that
1745 * the inode is locked and held in the
1746 * transaction that returns to it whether
1747 * errors occur or not. We don't mark the
1748 * inode dirty so that this transaction can
1749 * be easily aborted if possible.
1750 */
1751 xfs_trans_ijoin(ntp, ip,
1752 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1753 xfs_trans_ihold(ntp, ip);
1754 }
1755 return error;
1756 }
1757
1758 if (committed) {
1759 /*
1760 * The first xact was committed,
1761 * so add the inode to the new one.
1762 * Mark it dirty so it will be logged
1763 * and moved forward in the log as
1764 * part of every commit.
1765 */
1766 xfs_trans_ijoin(ntp, ip,
1767 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1768 xfs_trans_ihold(ntp, ip);
1769 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1770 }
1771 ntp = xfs_trans_dup(ntp);
1772 (void) xfs_trans_commit(*tp, 0, NULL);
1773 *tp = ntp;
1774 error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1775 XFS_TRANS_PERM_LOG_RES,
1776 XFS_ITRUNCATE_LOG_COUNT);
1777 /*
1778 * Add the inode being truncated to the next chained
1779 * transaction.
1780 */
1781 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1782 xfs_trans_ihold(ntp, ip);
1783 if (error)
1784			return error;
1785 }
1786 /*
1787 * Only update the size in the case of the data fork, but
1788 * always re-log the inode so that our permanent transaction
1789 * can keep on rolling it forward in the log.
1790 */
1791 if (fork == XFS_DATA_FORK) {
1792 xfs_isize_check(mp, ip, new_size);
1793 ip->i_d.di_size = new_size;
1794 }
1795 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1796 ASSERT((new_size != 0) ||
1797 (fork == XFS_ATTR_FORK) ||
1798 (ip->i_delayed_blks == 0));
1799 ASSERT((new_size != 0) ||
1800 (fork == XFS_ATTR_FORK) ||
1801 (ip->i_d.di_nextents == 0));
1802 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
1803 return 0;
1804}
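
/*
 * Illustrative sketch of the calling convention described above: a
 * hypothetical truncate-to-zero caller, modelled loosely on the
 * inactive path.  The transaction type and the elided error handling
 * are assumptions for illustration, not a verbatim copy of any caller:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
 *	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 *	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 *				  XFS_TRANS_PERM_LOG_RES,
 *				  XFS_ITRUNCATE_LOG_COUNT);
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	xfs_trans_ihold(tp, ip);
 *	error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, sync);
 *	... commit or cancel tp, then drop both locks ...
 */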
1805
1806
1807/*
1808 * xfs_igrow_start
1809 *
1810 * Do the first part of growing a file: zero any data in the last
1811 * block that is beyond the old EOF. We need to do this before
1812 * the inode is joined to the transaction to modify the i_size.
1813 * That way we can drop the inode lock and call into the buffer
1814 * cache to get the buffer mapping the EOF.
1815 */
1816int
1817xfs_igrow_start(
1818 xfs_inode_t *ip,
1819 xfs_fsize_t new_size,
1820 cred_t *credp)
1821{
1822 xfs_fsize_t isize;
1823 int error;
1824
1825 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1826 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1827 ASSERT(new_size > ip->i_d.di_size);
1828
1829 error = 0;
1830 isize = ip->i_d.di_size;
1831 /*
1832 * Zero any pages that may have been created by
1833 * xfs_write_file() beyond the end of the file
1834 * and any blocks between the old and new file sizes.
1835 */
1836 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize,
1837 new_size);
1838 return error;
1839}
1840
1841/*
1842 * xfs_igrow_finish
1843 *
1844 * This routine is called to extend the size of a file.
1845 * The inode must have both the iolock and the ilock locked
1846 * for update and it must be a part of the current transaction.
1847 * The xfs_igrow_start() function must have been called previously.
1848 * If the change_flag is not zero, the inode change timestamp will
1849 * be updated.
1850 */
1851void
1852xfs_igrow_finish(
1853 xfs_trans_t *tp,
1854 xfs_inode_t *ip,
1855 xfs_fsize_t new_size,
1856 int change_flag)
1857{
1858 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1859 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1860 ASSERT(ip->i_transp == tp);
1861 ASSERT(new_size > ip->i_d.di_size);
1862
1863 /*
1864 * Update the file size. Update the inode change timestamp
1865 * if change_flag set.
1866 */
1867 ip->i_d.di_size = new_size;
1868 if (change_flag)
1869 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1870 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1871
1872}
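
/*
 * Illustrative sketch of the grow protocol the two routines above
 * implement.  Lock ordering follows the ASSERTs above; the transaction
 * setup in the middle is an assumption for illustration:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	error = xfs_igrow_start(ip, new_size, credp);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	... allocate a transaction and reserve log space ...
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	xfs_trans_ihold(tp, ip);
 *	xfs_igrow_finish(tp, ip, new_size, 1);
 *	... commit tp and unlock ...
 */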
1873
1874
1875/*
1876 * This is called when the inode's link count goes to 0.
1877 * We place the on-disk inode on a list in the AGI. It
1878 * will be pulled from this list when the inode is freed.
1879 */
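/*
 * A sketch of the on-disk structure being maintained here (illustrative
 * only; the bucket is selected by agino % XFS_AGI_UNLINKED_BUCKETS):
 *
 *	agi_unlinked[bucket] --> ino A --> ino B --> ... --> NULLAGINO
 *
 * Each arrow is a di_next_unlinked field, and new inodes are inserted
 * at the head of the bucket's list.
 */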
1880int
1881xfs_iunlink(
1882 xfs_trans_t *tp,
1883 xfs_inode_t *ip)
1884{
1885 xfs_mount_t *mp;
1886 xfs_agi_t *agi;
1887 xfs_dinode_t *dip;
1888 xfs_buf_t *agibp;
1889 xfs_buf_t *ibp;
1890 xfs_agnumber_t agno;
1891 xfs_daddr_t agdaddr;
1892 xfs_agino_t agino;
1893 short bucket_index;
1894 int offset;
1895 int error;
1896 int agi_ok;
1897
1898 ASSERT(ip->i_d.di_nlink == 0);
1899 ASSERT(ip->i_d.di_mode != 0);
1900 ASSERT(ip->i_transp == tp);
1901
1902 mp = tp->t_mountp;
1903
1904 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1905 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1906
1907 /*
1908 * Get the agi buffer first. It ensures lock ordering
1909 * on the list.
1910 */
1911 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1912 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1913 if (error) {
1914 return error;
1915 }
1916 /*
1917 * Validate the magic number of the agi block.
1918 */
1919 agi = XFS_BUF_TO_AGI(agibp);
1920 agi_ok =
1921 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
1922 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
1923 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1924 XFS_RANDOM_IUNLINK))) {
1925 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1926 xfs_trans_brelse(tp, agibp);
1927 return XFS_ERROR(EFSCORRUPTED);
1928 }
1929 /*
1930 * Get the index into the agi hash table for the
1931 * list this inode will go on.
1932 */
1933 agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1934 ASSERT(agino != 0);
1935 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1936 ASSERT(agi->agi_unlinked[bucket_index]);
1937 ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != agino);
1938
1939 if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO) {
1940 /*
1941 * There is already another inode in the bucket we need
1942 * to add ourselves to. Add us at the front of the list.
1943 * Here we put the head pointer into our next pointer,
1944 * and then we fall through to point the head at us.
1945 */
1946 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1947 if (error) {
1948 return error;
1949 }
1950 ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO);
1951 ASSERT(dip->di_next_unlinked);
1952 /* both on-disk, don't endian flip twice */
1953 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1954 offset = ip->i_boffset +
1955 offsetof(xfs_dinode_t, di_next_unlinked);
1956 xfs_trans_inode_buf(tp, ibp);
1957 xfs_trans_log_buf(tp, ibp, offset,
1958 (offset + sizeof(xfs_agino_t) - 1));
1959 xfs_inobp_check(mp, ibp);
1960 }
1961
1962 /*
1963 * Point the bucket head pointer at the inode being inserted.
1964 */
1965 ASSERT(agino != 0);
1966 INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, agino);
1967 offset = offsetof(xfs_agi_t, agi_unlinked) +
1968 (sizeof(xfs_agino_t) * bucket_index);
1969 xfs_trans_log_buf(tp, agibp, offset,
1970 (offset + sizeof(xfs_agino_t) - 1));
1971 return 0;
1972}
1973
1974/*
1975 * Pull the on-disk inode from the AGI unlinked list.
1976 */
1977STATIC int
1978xfs_iunlink_remove(
1979 xfs_trans_t *tp,
1980 xfs_inode_t *ip)
1981{
1982 xfs_ino_t next_ino;
1983 xfs_mount_t *mp;
1984 xfs_agi_t *agi;
1985 xfs_dinode_t *dip;
1986 xfs_buf_t *agibp;
1987 xfs_buf_t *ibp;
1988 xfs_agnumber_t agno;
1989 xfs_daddr_t agdaddr;
1990 xfs_agino_t agino;
1991 xfs_agino_t next_agino;
1992 xfs_buf_t *last_ibp;
1993 xfs_dinode_t *last_dip;
1994 short bucket_index;
1995 int offset, last_offset;
1996 int error;
1997 int agi_ok;
1998
1999 /*
2000 * First pull the on-disk inode from the AGI unlinked list.
2001 */
2002 mp = tp->t_mountp;
2003
2004 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2005 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
2006
2007 /*
2008 * Get the agi buffer first. It ensures lock ordering
2009 * on the list.
2010 */
2011 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
2012 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
2013 if (error) {
2014 cmn_err(CE_WARN,
2015 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
2016 error, mp->m_fsname);
2017 return error;
2018 }
2019 /*
2020 * Validate the magic number of the agi block.
2021 */
2022 agi = XFS_BUF_TO_AGI(agibp);
2023 agi_ok =
2024 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
2025 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
2026 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
2027 XFS_RANDOM_IUNLINK_REMOVE))) {
2028 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
2029 mp, agi);
2030 xfs_trans_brelse(tp, agibp);
2031 cmn_err(CE_WARN,
2032 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
2033 mp->m_fsname);
2034 return XFS_ERROR(EFSCORRUPTED);
2035 }
2036 /*
2037 * Get the index into the agi hash table for the
2038 * list this inode will go on.
2039 */
2040 agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2041 ASSERT(agino != 0);
2042 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2043 ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO);
2044 ASSERT(agi->agi_unlinked[bucket_index]);
2045
2046 if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) == agino) {
2047 /*
2048 * We're at the head of the list. Get the inode's
2049 * on-disk buffer to see if there is anyone after us
2050 * on the list. Only modify our next pointer if it
2051 * is not already NULLAGINO. This saves us the overhead
2052 * of dealing with the buffer when there is no need to
2053 * change it.
2054 */
2055 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
2056 if (error) {
2057 cmn_err(CE_WARN,
2058 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
2059 error, mp->m_fsname);
2060 return error;
2061 }
2062 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
2063 ASSERT(next_agino != 0);
2064 if (next_agino != NULLAGINO) {
2065 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
2066 offset = ip->i_boffset +
2067 offsetof(xfs_dinode_t, di_next_unlinked);
2068 xfs_trans_inode_buf(tp, ibp);
2069 xfs_trans_log_buf(tp, ibp, offset,
2070 (offset + sizeof(xfs_agino_t) - 1));
2071 xfs_inobp_check(mp, ibp);
2072 } else {
2073 xfs_trans_brelse(tp, ibp);
2074 }
2075 /*
2076 * Point the bucket head pointer at the next inode.
2077 */
2078 ASSERT(next_agino != 0);
2079 ASSERT(next_agino != agino);
2080 INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, next_agino);
2081 offset = offsetof(xfs_agi_t, agi_unlinked) +
2082 (sizeof(xfs_agino_t) * bucket_index);
2083 xfs_trans_log_buf(tp, agibp, offset,
2084 (offset + sizeof(xfs_agino_t) - 1));
2085 } else {
2086 /*
2087 * We need to search the list for the inode being freed.
2088 */
2089 next_agino = INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT);
2090 last_ibp = NULL;
2091 while (next_agino != agino) {
2092 /*
2093 * If the last inode wasn't the one pointing to
2094 * us, then release its buffer since we're not
2095 * going to do anything with it.
2096 */
2097 if (last_ibp != NULL) {
2098 xfs_trans_brelse(tp, last_ibp);
2099 }
2100 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2101 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
2102 &last_ibp, &last_offset);
2103 if (error) {
2104 cmn_err(CE_WARN,
2105 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
2106 error, mp->m_fsname);
2107 return error;
2108 }
2109 next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT);
2110 ASSERT(next_agino != NULLAGINO);
2111 ASSERT(next_agino != 0);
2112 }
2113 /*
2114 * Now last_ibp points to the buffer previous to us on
2115 * the unlinked list. Pull us from the list.
2116 */
2117 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
2118 if (error) {
2119 cmn_err(CE_WARN,
2120 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
2121 error, mp->m_fsname);
2122 return error;
2123 }
2124 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
2125 ASSERT(next_agino != 0);
2126 ASSERT(next_agino != agino);
2127 if (next_agino != NULLAGINO) {
2128 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
2129 offset = ip->i_boffset +
2130 offsetof(xfs_dinode_t, di_next_unlinked);
2131 xfs_trans_inode_buf(tp, ibp);
2132 xfs_trans_log_buf(tp, ibp, offset,
2133 (offset + sizeof(xfs_agino_t) - 1));
2134 xfs_inobp_check(mp, ibp);
2135 } else {
2136 xfs_trans_brelse(tp, ibp);
2137 }
2138 /*
2139 * Point the previous inode on the list to the next inode.
2140 */
2141 INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
2142 ASSERT(next_agino != 0);
2143 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2144 xfs_trans_inode_buf(tp, last_ibp);
2145 xfs_trans_log_buf(tp, last_ibp, offset,
2146 (offset + sizeof(xfs_agino_t) - 1));
2147 xfs_inobp_check(mp, last_ibp);
2148 }
2149 return 0;
2150}
2151
2152static __inline__ int xfs_inode_clean(xfs_inode_t *ip)
2153{
2154 return (((ip->i_itemp == NULL) ||
2155 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
2156 (ip->i_update_core == 0));
2157}
2158
2159void
2160xfs_ifree_cluster(
2161 xfs_inode_t *free_ip,
2162 xfs_trans_t *tp,
2163 xfs_ino_t inum)
2164{
2165 xfs_mount_t *mp = free_ip->i_mount;
2166 int blks_per_cluster;
2167 int nbufs;
2168 int ninodes;
2169 int i, j, found, pre_flushed;
2170 xfs_daddr_t blkno;
2171 xfs_buf_t *bp;
2172 xfs_ihash_t *ih;
2173 xfs_inode_t *ip, **ip_found;
2174 xfs_inode_log_item_t *iip;
2175 xfs_log_item_t *lip;
2176 SPLDECL(s);
2177
2178 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
2179 blks_per_cluster = 1;
2180 ninodes = mp->m_sb.sb_inopblock;
2181 nbufs = XFS_IALLOC_BLOCKS(mp);
2182 } else {
2183 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2184 mp->m_sb.sb_blocksize;
2185 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2186 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2187 }
2188
2189 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
2190
2191 for (j = 0; j < nbufs; j++, inum += ninodes) {
2192 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2193 XFS_INO_TO_AGBNO(mp, inum));
2194
2195
2196 /*
2197		 * Look for each inode in memory and attempt to lock it;
2198		 * we can be racing with flush and tail pushing here.
2199		 * Any inode we get the locks on is added to an array of
2200		 * inode items to process later.
2201		 *
2202		 * To get the buffer lock, we could beat a flush
2203		 * or tail pushing thread to the lock here, in which
2204		 * case they will go looking for the inode buffer
2205		 * and fail; we need some other form of interlock
2206		 * here.
2207 */
2208 found = 0;
2209 for (i = 0; i < ninodes; i++) {
2210 ih = XFS_IHASH(mp, inum + i);
2211 read_lock(&ih->ih_lock);
2212 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
2213 if (ip->i_ino == inum + i)
2214 break;
2215 }
2216
2217 /* Inode not in memory or we found it already,
2218 * nothing to do
2219 */
2220 if (!ip || (ip->i_flags & XFS_ISTALE)) {
2221 read_unlock(&ih->ih_lock);
2222 continue;
2223 }
2224
2225 if (xfs_inode_clean(ip)) {
2226 read_unlock(&ih->ih_lock);
2227 continue;
2228 }
2229
2230 /* If we can get the locks then add it to the
2231 * list, otherwise by the time we get the bp lock
2232 * below it will already be attached to the
2233 * inode buffer.
2234 */
2235
2236			/* This inode will already be locked - by us; let's
2237 * keep it that way.
2238 */
2239
2240 if (ip == free_ip) {
2241 if (xfs_iflock_nowait(ip)) {
2242 ip->i_flags |= XFS_ISTALE;
2243
2244 if (xfs_inode_clean(ip)) {
2245 xfs_ifunlock(ip);
2246 } else {
2247 ip_found[found++] = ip;
2248 }
2249 }
2250 read_unlock(&ih->ih_lock);
2251 continue;
2252 }
2253
2254 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2255 if (xfs_iflock_nowait(ip)) {
2256 ip->i_flags |= XFS_ISTALE;
2257
2258 if (xfs_inode_clean(ip)) {
2259 xfs_ifunlock(ip);
2260 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2261 } else {
2262 ip_found[found++] = ip;
2263 }
2264 } else {
2265 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2266 }
2267 }
2268
2269 read_unlock(&ih->ih_lock);
2270 }
2271
2272 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2273 mp->m_bsize * blks_per_cluster,
2274 XFS_BUF_LOCK);
2275
2276 pre_flushed = 0;
2277 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
2278 while (lip) {
2279 if (lip->li_type == XFS_LI_INODE) {
2280 iip = (xfs_inode_log_item_t *)lip;
2281 ASSERT(iip->ili_logged == 1);
2282 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2283 AIL_LOCK(mp,s);
2284 iip->ili_flush_lsn = iip->ili_item.li_lsn;
2285 AIL_UNLOCK(mp, s);
2286 iip->ili_inode->i_flags |= XFS_ISTALE;
2287 pre_flushed++;
2288 }
2289 lip = lip->li_bio_list;
2290 }
2291
2292 for (i = 0; i < found; i++) {
2293 ip = ip_found[i];
2294 iip = ip->i_itemp;
2295
2296 if (!iip) {
2297 ip->i_update_core = 0;
2298 xfs_ifunlock(ip);
2299 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2300 continue;
2301 }
2302
2303 iip->ili_last_fields = iip->ili_format.ilf_fields;
2304 iip->ili_format.ilf_fields = 0;
2305 iip->ili_logged = 1;
2306 AIL_LOCK(mp,s);
2307 iip->ili_flush_lsn = iip->ili_item.li_lsn;
2308 AIL_UNLOCK(mp, s);
2309
2310 xfs_buf_attach_iodone(bp,
2311 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2312 xfs_istale_done, (xfs_log_item_t *)iip);
2313 if (ip != free_ip) {
2314 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2315 }
2316 }
2317
2318 if (found || pre_flushed)
2319 xfs_trans_stale_inode_buf(tp, bp);
2320 xfs_trans_binval(tp, bp);
2321 }
2322
2323 kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
2324}
2325
2326/*
2327 * This is called to return an inode to the inode free list.
2328 * The inode should already be truncated to 0 length and have
2329 * no pages associated with it. This routine also assumes that
2330 * the inode is already a part of the transaction.
2331 *
2332 * The on-disk copy of the inode will have been added to the list
2333 * of unlinked inodes in the AGI. We need to remove the inode from
2334 * that list atomically with respect to freeing it here.
2335 */
2336int
2337xfs_ifree(
2338 xfs_trans_t *tp,
2339 xfs_inode_t *ip,
2340 xfs_bmap_free_t *flist)
2341{
2342 int error;
2343 int delete;
2344 xfs_ino_t first_ino;
2345
2346 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2347 ASSERT(ip->i_transp == tp);
2348 ASSERT(ip->i_d.di_nlink == 0);
2349 ASSERT(ip->i_d.di_nextents == 0);
2350 ASSERT(ip->i_d.di_anextents == 0);
2351 ASSERT((ip->i_d.di_size == 0) ||
2352 ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
2353 ASSERT(ip->i_d.di_nblocks == 0);
2354
2355 /*
2356 * Pull the on-disk inode from the AGI unlinked list.
2357 */
2358 error = xfs_iunlink_remove(tp, ip);
2359 if (error != 0) {
2360 return error;
2361 }
2362
2363 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2364 if (error != 0) {
2365 return error;
2366 }
2367 ip->i_d.di_mode = 0; /* mark incore inode as free */
2368 ip->i_d.di_flags = 0;
2369 ip->i_d.di_dmevmask = 0;
2370 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
2371 ip->i_df.if_ext_max =
2372 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
2373 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2374 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2375 /*
2376 * Bump the generation count so no one will be confused
2377 * by reincarnations of this inode.
2378 */
2379 ip->i_d.di_gen++;
2380 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2381
2382 if (delete) {
2383 xfs_ifree_cluster(ip, tp, first_ino);
2384 }
2385
2386 return 0;
2387}
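
/*
 * Illustrative sketch of how a caller drives xfs_ifree(); the shape of
 * the surrounding transaction handling is an assumption for
 * illustration:
 *
 *	XFS_BMAP_INIT(&free_list, &first_block);
 *	error = xfs_ifree(tp, ip, &free_list);
 *	if (!error)
 *		error = xfs_bmap_finish(&tp, &free_list, first_block,
 *					&committed);
 */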
2388
2389/*
2390 * Reallocate the space for if_broot based on the number of records
2391 * being added or deleted as indicated in rec_diff. Move the records
2392 * and pointers in if_broot to fit the new size. When shrinking this
2393 * will eliminate holes between the records and pointers created by
2394 * the caller. When growing this will create holes to be filled in
2395 * by the caller.
2396 *
2397 * The caller must not request to add more records than would fit in
2398 * the on-disk inode root. If the if_broot is currently NULL, then
2399	 * if we are adding records, one will be allocated.  The caller must also
2400 * not request that the number of records go below zero, although
2401 * it can go to zero.
2402 *
2403 * ip -- the inode whose if_broot area is changing
2404	 * rec_diff -- the change in the number of records, positive or negative,
2405	 *	 requested for the if_broot array.
2406 */
2407void
2408xfs_iroot_realloc(
2409 xfs_inode_t *ip,
2410 int rec_diff,
2411 int whichfork)
2412{
2413 int cur_max;
2414 xfs_ifork_t *ifp;
2415 xfs_bmbt_block_t *new_broot;
2416 int new_max;
2417 size_t new_size;
2418 char *np;
2419 char *op;
2420
2421 /*
2422 * Handle the degenerate case quietly.
2423 */
2424 if (rec_diff == 0) {
2425 return;
2426 }
2427
2428 ifp = XFS_IFORK_PTR(ip, whichfork);
2429 if (rec_diff > 0) {
2430 /*
2431 * If there wasn't any memory allocated before, just
2432 * allocate it now and get out.
2433 */
2434 if (ifp->if_broot_bytes == 0) {
2435 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2436 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
2437 KM_SLEEP);
2438 ifp->if_broot_bytes = (int)new_size;
2439 return;
2440 }
2441
2442 /*
2443 * If there is already an existing if_broot, then we need
2444 * to realloc() it and shift the pointers to their new
2445 * location. The records don't change location because
2446 * they are kept butted up against the btree block header.
2447 */
2448 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2449 new_max = cur_max + rec_diff;
2450 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2451 ifp->if_broot = (xfs_bmbt_block_t *)
2452 kmem_realloc(ifp->if_broot,
2453 new_size,
2454 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2455 KM_SLEEP);
2456 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2457 ifp->if_broot_bytes);
2458 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2459 (int)new_size);
2460 ifp->if_broot_bytes = (int)new_size;
2461 ASSERT(ifp->if_broot_bytes <=
2462 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2463 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2464 return;
2465 }
2466
2467 /*
2468 * rec_diff is less than 0. In this case, we are shrinking the
2469 * if_broot buffer. It must already exist. If we go to zero
2470 * records, just get rid of the root and clear the status bit.
2471 */
2472 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2473 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2474 new_max = cur_max + rec_diff;
2475 ASSERT(new_max >= 0);
2476 if (new_max > 0)
2477 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2478 else
2479 new_size = 0;
2480 if (new_size > 0) {
2481 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
2482 /*
2483 * First copy over the btree block header.
2484 */
2485 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
2486 } else {
2487 new_broot = NULL;
2488 ifp->if_flags &= ~XFS_IFBROOT;
2489 }
2490
2491 /*
2492 * Only copy the records and pointers if there are any.
2493 */
2494 if (new_max > 0) {
2495 /*
2496 * First copy the records.
2497 */
2498 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
2499 ifp->if_broot_bytes);
2500 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2501 (int)new_size);
2502 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2503
2504 /*
2505 * Then copy the pointers.
2506 */
2507 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2508 ifp->if_broot_bytes);
2509 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
2510 (int)new_size);
2511 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2512 }
2513 kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2514 ifp->if_broot = new_broot;
2515 ifp->if_broot_bytes = (int)new_size;
2516 ASSERT(ifp->if_broot_bytes <=
2517 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2518 return;
2519}
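
/*
 * For example (hypothetical call sites, for illustration only): the
 * bmap btree code grows the incore root by one record with
 * xfs_iroot_realloc(ip, 1, whichfork) and shrinks it again with
 * xfs_iroot_realloc(ip, -1, whichfork).
 */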
2520
2521
2522/*
2523 * This is called when the amount of space needed for if_extents
2524 * is increased or decreased. The change in size is indicated by
2525 * the number of extents that need to be added or deleted in the
2526 * ext_diff parameter.
2527 *
2528 * If the amount of space needed has decreased below the size of the
2529 * inline buffer, then switch to using the inline buffer. Otherwise,
2530 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2531 * to what is needed.
2532 *
2533 * ip -- the inode whose if_extents area is changing
2534 * ext_diff -- the change in the number of extents, positive or negative,
2535 * requested for the if_extents array.
2536 */
2537void
2538xfs_iext_realloc(
2539 xfs_inode_t *ip,
2540 int ext_diff,
2541 int whichfork)
2542{
2543 int byte_diff;
2544 xfs_ifork_t *ifp;
2545 int new_size;
2546 uint rnew_size;
2547
2548 if (ext_diff == 0) {
2549 return;
2550 }
2551
2552 ifp = XFS_IFORK_PTR(ip, whichfork);
2553 byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t);
2554 new_size = (int)ifp->if_bytes + byte_diff;
2555 ASSERT(new_size >= 0);
2556
2557 if (new_size == 0) {
2558 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2559 ASSERT(ifp->if_real_bytes != 0);
2560 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2561 }
2562 ifp->if_u1.if_extents = NULL;
2563 rnew_size = 0;
2564 } else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) {
2565 /*
2566 * If the valid extents can fit in if_inline_ext,
2567 * copy them from the malloc'd vector and free it.
2568 */
2569 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2570 /*
2571 * For now, empty files are format EXTENTS,
2572 * so the if_extents pointer is null.
2573 */
2574 if (ifp->if_u1.if_extents) {
2575 memcpy(ifp->if_u2.if_inline_ext,
2576 ifp->if_u1.if_extents, new_size);
2577 kmem_free(ifp->if_u1.if_extents,
2578 ifp->if_real_bytes);
2579 }
2580 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
2581 }
2582 rnew_size = 0;
2583 } else {
2584 rnew_size = new_size;
2585 if ((rnew_size & (rnew_size - 1)) != 0)
2586 rnew_size = xfs_iroundup(rnew_size);
2587 /*
2588 * Stuck with malloc/realloc.
2589 */
2590 if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) {
2591 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2592 kmem_alloc(rnew_size, KM_SLEEP);
2593 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
2594 sizeof(ifp->if_u2.if_inline_ext));
2595 } else if (rnew_size != ifp->if_real_bytes) {
2596 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2597 kmem_realloc(ifp->if_u1.if_extents,
2598 rnew_size,
2599 ifp->if_real_bytes,
2600 KM_NOFS);
2601 }
2602 }
2603 ifp->if_real_bytes = rnew_size;
2604 ifp->if_bytes = new_size;
2605}
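
/*
 * For example (hypothetical call site, for illustration only): before
 * inserting one new record into the extent list, a caller makes room
 * with xfs_iext_realloc(ip, 1, whichfork) and then shifts the existing
 * records to open a hole for the new one.
 */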
2606
2607
2608/*
2609 * This is called when the amount of space needed for if_data
2610 * is increased or decreased. The change in size is indicated by
2611 * the number of bytes that need to be added or deleted in the
2612 * byte_diff parameter.
2613 *
2614 * If the amount of space needed has decreased below the size of the
2615 * inline buffer, then switch to using the inline buffer. Otherwise,
2616 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2617 * to what is needed.
2618 *
2619 * ip -- the inode whose if_data area is changing
2620 * byte_diff -- the change in the number of bytes, positive or negative,
2621 * requested for the if_data array.
2622 */
2623void
2624xfs_idata_realloc(
2625 xfs_inode_t *ip,
2626 int byte_diff,
2627 int whichfork)
2628{
2629 xfs_ifork_t *ifp;
2630 int new_size;
2631 int real_size;
2632
2633 if (byte_diff == 0) {
2634 return;
2635 }
2636
2637 ifp = XFS_IFORK_PTR(ip, whichfork);
2638 new_size = (int)ifp->if_bytes + byte_diff;
2639 ASSERT(new_size >= 0);
2640
2641 if (new_size == 0) {
2642 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2643 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2644 }
2645 ifp->if_u1.if_data = NULL;
2646 real_size = 0;
2647 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2648 /*
2649		 * If the valid data can fit in if_inline_data,
2650		 * copy it from the malloc'd buffer and free that buffer.
2651 */
2652 if (ifp->if_u1.if_data == NULL) {
2653 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2654 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2655 ASSERT(ifp->if_real_bytes != 0);
2656 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2657 new_size);
2658 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2659 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2660 }
2661 real_size = 0;
2662 } else {
2663 /*
2664 * Stuck with malloc/realloc.
2665 * For inline data, the underlying buffer must be
2666 * a multiple of 4 bytes in size so that it can be
2667 * logged and stay on word boundaries. We enforce
2668 * that here.
2669 */
2670 real_size = roundup(new_size, 4);
2671 if (ifp->if_u1.if_data == NULL) {
2672 ASSERT(ifp->if_real_bytes == 0);
2673 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2674 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2675 /*
2676 * Only do the realloc if the underlying size
2677 * is really changing.
2678 */
2679 if (ifp->if_real_bytes != real_size) {
2680 ifp->if_u1.if_data =
2681 kmem_realloc(ifp->if_u1.if_data,
2682 real_size,
2683 ifp->if_real_bytes,
2684 KM_SLEEP);
2685 }
2686 } else {
2687 ASSERT(ifp->if_real_bytes == 0);
2688 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2689 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2690 ifp->if_bytes);
2691 }
2692 }
2693 ifp->if_real_bytes = real_size;
2694 ifp->if_bytes = new_size;
2695 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2696}
2697
2698
2699
2700
2701/*
2702 * Map inode to disk block and offset.
2703 *
2704 * mp -- the mount point structure for the current file system
2705 * tp -- the current transaction
2706 * ino -- the inode number of the inode to be located
2707 * imap -- this structure is filled in with the information necessary
2708 * to retrieve the given inode from disk
2709	 * flags -- flags to pass to xfs_dilocate, indicating whether or not
2710	 *	 lookups in the inode btree are OK
2711 */
2712int
2713xfs_imap(
2714 xfs_mount_t *mp,
2715 xfs_trans_t *tp,
2716 xfs_ino_t ino,
2717 xfs_imap_t *imap,
2718 uint flags)
2719{
2720 xfs_fsblock_t fsbno;
2721 int len;
2722 int off;
2723 int error;
2724
2725 fsbno = imap->im_blkno ?
2726 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2727 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2728 if (error != 0) {
2729 return error;
2730 }
2731 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2732 imap->im_len = XFS_FSB_TO_BB(mp, len);
2733 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2734 imap->im_ioffset = (ushort)off;
2735 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2736 return 0;
2737}
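
/*
 * Illustrative use (an assumed caller shape, e.g. mapping an inode to
 * its buffer): zero im_blkno to force a fresh lookup, then read the
 * mapped range.
 *
 *	imap.im_blkno = 0;
 *	error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
 *	if (!error)
 *		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 *					   imap.im_blkno, (int)imap.im_len,
 *					   0, &bp);
 */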
2738
2739void
2740xfs_idestroy_fork(
2741 xfs_inode_t *ip,
2742 int whichfork)
2743{
2744 xfs_ifork_t *ifp;
2745
2746 ifp = XFS_IFORK_PTR(ip, whichfork);
2747 if (ifp->if_broot != NULL) {
2748 kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2749 ifp->if_broot = NULL;
2750 }
2751
2752 /*
2753 * If the format is local, then we can't have an extents
2754 * array so just look for an inline data array. If we're
2755 * not local then we may or may not have an extents list,
2756 * so check and free it up if we do.
2757 */
2758 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2759 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2760 (ifp->if_u1.if_data != NULL)) {
2761 ASSERT(ifp->if_real_bytes != 0);
2762 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2763 ifp->if_u1.if_data = NULL;
2764 ifp->if_real_bytes = 0;
2765 }
2766 } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2767 (ifp->if_u1.if_extents != NULL) &&
2768 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
2769 ASSERT(ifp->if_real_bytes != 0);
2770 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2771 ifp->if_u1.if_extents = NULL;
2772 ifp->if_real_bytes = 0;
2773 }
2774 ASSERT(ifp->if_u1.if_extents == NULL ||
2775 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2776 ASSERT(ifp->if_real_bytes == 0);
2777 if (whichfork == XFS_ATTR_FORK) {
2778 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2779 ip->i_afp = NULL;
2780 }
2781}
2782
2783/*
2784	 * This is called to free all the memory associated with an inode.
2785 * It must free the inode itself and any buffers allocated for
2786 * if_extents/if_data and if_broot. It must also free the lock
2787 * associated with the inode.
2788 */
2789void
2790xfs_idestroy(
2791 xfs_inode_t *ip)
2792{
2793
2794 switch (ip->i_d.di_mode & S_IFMT) {
2795 case S_IFREG:
2796 case S_IFDIR:
2797 case S_IFLNK:
2798 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2799 break;
2800 }
2801 if (ip->i_afp)
2802 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2803 mrfree(&ip->i_lock);
2804 mrfree(&ip->i_iolock);
2805 freesema(&ip->i_flock);
2806#ifdef XFS_BMAP_TRACE
2807 ktrace_free(ip->i_xtrace);
2808#endif
2809#ifdef XFS_BMBT_TRACE
2810 ktrace_free(ip->i_btrace);
2811#endif
2812#ifdef XFS_RW_TRACE
2813 ktrace_free(ip->i_rwtrace);
2814#endif
2815#ifdef XFS_ILOCK_TRACE
2816 ktrace_free(ip->i_lock_trace);
2817#endif
2818#ifdef XFS_DIR2_TRACE
2819 ktrace_free(ip->i_dir_trace);
2820#endif
2821 if (ip->i_itemp) {
2822 /* XXXdpd should be able to assert this but shutdown
2823 * is leaving the AIL behind. */
2824 ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
2825 XFS_FORCED_SHUTDOWN(ip->i_mount));
2826 xfs_inode_item_destroy(ip);
2827 }
2828 kmem_zone_free(xfs_inode_zone, ip);
2829}
2830
2831
2832/*
2833	 * Increment the pin count of the given inode.
2834	 * The count is manipulated atomically (see ip->i_pincount).
2835 */
2836void
2837xfs_ipin(
2838 xfs_inode_t *ip)
2839{
2840 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2841
2842 atomic_inc(&ip->i_pincount);
2843}
2844
2845/*
2846	 * Decrement the pin count of the given inode, and wake up
2847	 * anyone in xfs_iunpin_wait() if the count goes to 0.  The
2848	 * inode must have been previously pinned with a call to xfs_ipin().
2849 */
2850void
2851xfs_iunpin(
2852 xfs_inode_t *ip)
2853{
2854 ASSERT(atomic_read(&ip->i_pincount) > 0);
2855
2856 if (atomic_dec_and_test(&ip->i_pincount)) {
2857 vnode_t *vp = XFS_ITOV_NULL(ip);
2858
2859 /* make sync come back and flush this inode */
2860 if (vp) {
2861 struct inode *inode = LINVFS_GET_IP(vp);
2862
2863 if (!(inode->i_state & I_NEW))
2864 mark_inode_dirty_sync(inode);
2865 }
2866
2867 wake_up(&ip->i_ipin_wait);
2868 }
2869}
2870
2871/*
2872 * This is called to wait for the given inode to be unpinned.
2873 * It will sleep until this happens. The caller must have the
2874	 * inode locked in at least shared mode so that the inode cannot
2875	 * be subsequently pinned once someone is waiting for it to be
2876	 * unpinned.
2877 */
2878void
2879xfs_iunpin_wait(
2880 xfs_inode_t *ip)
2881{
2882 xfs_inode_log_item_t *iip;
2883 xfs_lsn_t lsn;
2884
2885 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
2886
2887 if (atomic_read(&ip->i_pincount) == 0) {
2888 return;
2889 }
2890
2891 iip = ip->i_itemp;
2892 if (iip && iip->ili_last_lsn) {
2893 lsn = iip->ili_last_lsn;
2894 } else {
2895 lsn = (xfs_lsn_t)0;
2896 }
2897
2898 /*
2899 * Give the log a push so we don't wait here too long.
2900 */
2901 xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
2902
2903 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2904}
2905
2906
2907/*
2908 * xfs_iextents_copy()
2909 *
2910 * This is called to copy the REAL extents (as opposed to the delayed
2911 * allocation extents) from the inode into the given buffer. It
2912 * returns the number of bytes copied into the buffer.
2913 *
2914	 * We examine each extent in turn and skip those which are
2915	 * delayed allocations, so that only the real extents end up
2916	 * in the buffer.
2917 */
2918int
2919xfs_iextents_copy(
2920 xfs_inode_t *ip,
2921 xfs_bmbt_rec_t *buffer,
2922 int whichfork)
2923{
2924 int copied;
2925 xfs_bmbt_rec_t *dest_ep;
2926 xfs_bmbt_rec_t *ep;
2927#ifdef XFS_BMAP_TRACE
2928 static char fname[] = "xfs_iextents_copy";
2929#endif
2930 int i;
2931 xfs_ifork_t *ifp;
2932 int nrecs;
2933 xfs_fsblock_t start_block;
2934
2935 ifp = XFS_IFORK_PTR(ip, whichfork);
2936 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
2937 ASSERT(ifp->if_bytes > 0);
2938
2939 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2940 xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork);
2941 ASSERT(nrecs > 0);
2942
2943 /*
2944	 * Copy the extents one at a time, skipping any delayed
2945	 * allocation extents.  When the loop finishes, at least
2946	 * one real (non-delayed) extent must have been copied
2947	 * into the buffer.
2948 */
2949 ep = ifp->if_u1.if_extents;
2950 dest_ep = buffer;
2951 copied = 0;
2952 for (i = 0; i < nrecs; i++) {
2953 start_block = xfs_bmbt_get_startblock(ep);
2954 if (ISNULLSTARTBLOCK(start_block)) {
2955 /*
2956 * It's a delayed allocation extent, so skip it.
2957 */
2958 ep++;
2959 continue;
2960 }
2961
2962 /* Translate to on disk format */
2963 put_unaligned(INT_GET(ep->l0, ARCH_CONVERT),
2964 (__uint64_t*)&dest_ep->l0);
2965 put_unaligned(INT_GET(ep->l1, ARCH_CONVERT),
2966 (__uint64_t*)&dest_ep->l1);
2967 dest_ep++;
2968 ep++;
2969 copied++;
2970 }
2971 ASSERT(copied != 0);
2972 xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip));
2973
2974 return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2975}
2976
2977/*
2978 * Each of the following cases stores data into the same region
2979 * of the on-disk inode, so only one of them can be valid at
2980 * any given time. While it is possible to have conflicting formats
2981 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2982 * in EXTENTS format, this can only happen when the fork has
2983 * changed formats after being modified but before being flushed.
2984 * In these cases, the format always takes precedence, because the
2985 * format indicates the current state of the fork.
2986 */
2987/*ARGSUSED*/
2988STATIC int
2989xfs_iflush_fork(
2990 xfs_inode_t *ip,
2991 xfs_dinode_t *dip,
2992 xfs_inode_log_item_t *iip,
2993 int whichfork,
2994 xfs_buf_t *bp)
2995{
2996 char *cp;
2997 xfs_ifork_t *ifp;
2998 xfs_mount_t *mp;
2999#ifdef XFS_TRANS_DEBUG
3000 int first;
3001#endif
3002 static const short brootflag[2] =
3003 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
3004 static const short dataflag[2] =
3005 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
3006 static const short extflag[2] =
3007 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
3008
3009 if (iip == NULL)
3010 return 0;
3011 ifp = XFS_IFORK_PTR(ip, whichfork);
3012 /*
3013 * This can happen if we gave up in iformat in an error path,
3014 * for the attribute fork.
3015 */
3016 if (ifp == NULL) {
3017 ASSERT(whichfork == XFS_ATTR_FORK);
3018 return 0;
3019 }
3020 cp = XFS_DFORK_PTR(dip, whichfork);
3021 mp = ip->i_mount;
3022 switch (XFS_IFORK_FORMAT(ip, whichfork)) {
3023 case XFS_DINODE_FMT_LOCAL:
3024 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
3025 (ifp->if_bytes > 0)) {
3026 ASSERT(ifp->if_u1.if_data != NULL);
3027 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
3028 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
3029 }
3030 if (whichfork == XFS_DATA_FORK) {
3031 if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
3032 XFS_ERROR_REPORT("xfs_iflush_fork",
3033 XFS_ERRLEVEL_LOW, mp);
3034 return XFS_ERROR(EFSCORRUPTED);
3035 }
3036 }
3037 break;
3038
3039 case XFS_DINODE_FMT_EXTENTS:
3040 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
3041 !(iip->ili_format.ilf_fields & extflag[whichfork]));
3042 ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0));
3043 ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0));
3044 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
3045 (ifp->if_bytes > 0)) {
3046 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
3047 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
3048 whichfork);
3049 }
3050 break;
3051
3052 case XFS_DINODE_FMT_BTREE:
3053 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
3054 (ifp->if_broot_bytes > 0)) {
3055 ASSERT(ifp->if_broot != NULL);
3056 ASSERT(ifp->if_broot_bytes <=
3057 (XFS_IFORK_SIZE(ip, whichfork) +
3058 XFS_BROOT_SIZE_ADJ));
3059 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
3060 (xfs_bmdr_block_t *)cp,
3061 XFS_DFORK_SIZE(dip, mp, whichfork));
3062 }
3063 break;
3064
3065 case XFS_DINODE_FMT_DEV:
3066 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
3067 ASSERT(whichfork == XFS_DATA_FORK);
3068 INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev);
3069 }
3070 break;
3071
3072 case XFS_DINODE_FMT_UUID:
3073 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
3074 ASSERT(whichfork == XFS_DATA_FORK);
3075 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
3076 sizeof(uuid_t));
3077 }
3078 break;
3079
3080 default:
3081 ASSERT(0);
3082 break;
3083 }
3084
3085 return 0;
3086}
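
The comment before xfs_iflush_fork describes a tagged union: the format value selects which interpretation of the shared on-disk area is live, and the flush writes only that one. A minimal sketch of the pattern, with illustrative types rather than the XFS ones:

#include <stdint.h>
#include <string.h>

enum fmt { FMT_LOCAL, FMT_DEV, FMT_UUID };

/* The shared on-disk area: only one union member is live at a time. */
struct ondisk {
	union {
		char		data[32];	/* inline file data */
		uint32_t	rdev;		/* device number */
		unsigned char	uuid[16];	/* mount point uuid */
	} u;
};

struct incore {
	enum fmt	format;		/* says which member is live */
	const char	*data;
	size_t		bytes;
	uint32_t	rdev;
	unsigned char	uuid[16];
};

/* The format tag takes precedence: switch on it and write only the
 * matching interpretation of the shared area. */
int flush_fork(struct ondisk *d, const struct incore *i)
{
	switch (i->format) {
	case FMT_LOCAL:
		if (i->bytes > sizeof(d->u.data))
			return -1;	/* corrupt in-core state */
		memcpy(d->u.data, i->data, i->bytes);
		break;
	case FMT_DEV:
		d->u.rdev = i->rdev;
		break;
	case FMT_UUID:
		memcpy(d->u.uuid, i->uuid, sizeof(d->u.uuid));
		break;
	}
	return 0;
}
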
3087
3088/*
3089 * xfs_iflush() will write a modified inode's changes out to the
3090 * inode's on disk home. The caller must have the inode lock held
3091 * in at least shared mode and the inode flush semaphore must be
3092 * held as well. The inode lock will still be held upon return from
3093 * the call and the caller is free to unlock it.
3094 * The inode flush lock will be unlocked when the inode reaches the disk.
3095 * The flags indicate how the inode's buffer should be written out.
3096 */
3097int
3098xfs_iflush(
3099 xfs_inode_t *ip,
3100 uint flags)
3101{
3102 xfs_inode_log_item_t *iip;
3103 xfs_buf_t *bp;
3104 xfs_dinode_t *dip;
3105 xfs_mount_t *mp;
3106 int error;
3107 /* REFERENCED */
3108 xfs_chash_t *ch;
3109 xfs_inode_t *iq;
3110 int clcount; /* count of inodes clustered */
3111 int bufwasdelwri;
3112 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3113 SPLDECL(s);
3114
3115 XFS_STATS_INC(xs_iflush_count);
3116
3117 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3118 ASSERT(valusema(&ip->i_flock) <= 0);
3119 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3120 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3121
3122 iip = ip->i_itemp;
3123 mp = ip->i_mount;
3124
3125 /*
3126 * If the inode isn't dirty, then just release the inode
3127 * flush lock and do nothing.
3128 */
3129 if ((ip->i_update_core == 0) &&
3130 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3131 ASSERT((iip != NULL) ?
3132 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
3133 xfs_ifunlock(ip);
3134 return 0;
3135 }
3136
3137 /*
3138 * We can't flush the inode until it is unpinned, so
3139	 * wait for it.  We know no one new can pin it, because
3140 * we are holding the inode lock shared and you need
3141 * to hold it exclusively to pin the inode.
3142 */
3143 xfs_iunpin_wait(ip);
3144
3145 /*
3146 * This may have been unpinned because the filesystem is shutting
3147 * down forcibly. If that's the case we must not write this inode
3148 * to disk, because the log record didn't make it to disk!
3149 */
3150 if (XFS_FORCED_SHUTDOWN(mp)) {
3151 ip->i_update_core = 0;
3152 if (iip)
3153 iip->ili_format.ilf_fields = 0;
3154 xfs_ifunlock(ip);
3155 return XFS_ERROR(EIO);
3156 }
3157
3158 /*
3159 * Get the buffer containing the on-disk inode.
3160 */
3161 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
3162 if (error != 0) {
3163 xfs_ifunlock(ip);
3164 return error;
3165 }
3166
3167 /*
3168	 * Decide how the buffer will be flushed out.  This is done before
3169	 * the call to xfs_iflush_int because ilf_fields is zeroed by it.
3170 */
3171 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3172 /*
3173 * Flush out the inode buffer according to the directions
3174 * of the caller. In the cases where the caller has given
3175		 * us a choice, choose the non-delwri case.  This is because
3176 * the inode is in the AIL and we need to get it out soon.
3177 */
3178 switch (flags) {
3179 case XFS_IFLUSH_SYNC:
3180 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3181 flags = 0;
3182 break;
3183 case XFS_IFLUSH_ASYNC:
3184 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3185 flags = INT_ASYNC;
3186 break;
3187 case XFS_IFLUSH_DELWRI:
3188 flags = INT_DELWRI;
3189 break;
3190 default:
3191 ASSERT(0);
3192 flags = 0;
3193 break;
3194 }
3195 } else {
3196 switch (flags) {
3197 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3198 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3199 case XFS_IFLUSH_DELWRI:
3200 flags = INT_DELWRI;
3201 break;
3202 case XFS_IFLUSH_ASYNC:
3203 flags = INT_ASYNC;
3204 break;
3205 case XFS_IFLUSH_SYNC:
3206 flags = 0;
3207 break;
3208 default:
3209 ASSERT(0);
3210 flags = 0;
3211 break;
3212 }
3213 }
3214
3215 /*
3216 * First flush out the inode that xfs_iflush was called with.
3217 */
3218 error = xfs_iflush_int(ip, bp);
3219 if (error) {
3220 goto corrupt_out;
3221 }
3222
3223 /*
3224 * inode clustering:
3225 * see if other inodes can be gathered into this write
3226 */
3227
3228 ip->i_chash->chl_buf = bp;
3229
3230 ch = XFS_CHASH(mp, ip->i_blkno);
3231 s = mutex_spinlock(&ch->ch_lock);
3232
3233 clcount = 0;
3234 for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
3235 /*
3236		 * Do an unprotected check to see if the inode is dirty and
3237 * is a candidate for flushing. These checks will be repeated
3238 * later after the appropriate locks are acquired.
3239 */
3240 iip = iq->i_itemp;
3241 if ((iq->i_update_core == 0) &&
3242 ((iip == NULL) ||
3243 !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
3244 xfs_ipincount(iq) == 0) {
3245 continue;
3246 }
3247
3248 /*
3249 * Try to get locks. If any are unavailable,
3250 * then this inode cannot be flushed and is skipped.
3251 */
3252
3253 /* get inode locks (just i_lock) */
3254 if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
3255 /* get inode flush lock */
3256 if (xfs_iflock_nowait(iq)) {
3257 /* check if pinned */
3258 if (xfs_ipincount(iq) == 0) {
3259				/* Arriving here means that
3260				 * this inode can be flushed.
3261				 * First, re-check that it's
3262				 * dirty.
3263 */
3264 iip = iq->i_itemp;
3265				if ((iq->i_update_core != 0) ||
3266 ((iip != NULL) &&
3267 (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3268 clcount++;
3269 error = xfs_iflush_int(iq, bp);
3270 if (error) {
3271 xfs_iunlock(iq,
3272 XFS_ILOCK_SHARED);
3273 goto cluster_corrupt_out;
3274 }
3275 } else {
3276 xfs_ifunlock(iq);
3277 }
3278 } else {
3279 xfs_ifunlock(iq);
3280 }
3281 }
3282 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3283 }
3284 }
3285 mutex_spinunlock(&ch->ch_lock, s);
3286
3287 if (clcount) {
3288 XFS_STATS_INC(xs_icluster_flushcnt);
3289 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
3290 }
3291
3292 /*
3293 * If the buffer is pinned then push on the log so we won't
3294 * get stuck waiting in the write for too long.
3295 */
3296	if (XFS_BUF_ISPINNED(bp)) {
3297 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3298 }
3299
3300 if (flags & INT_DELWRI) {
3301 xfs_bdwrite(mp, bp);
3302 } else if (flags & INT_ASYNC) {
3303 xfs_bawrite(mp, bp);
3304 } else {
3305 error = xfs_bwrite(mp, bp);
3306 }
3307 return error;
3308
3309corrupt_out:
3310 xfs_buf_relse(bp);
3311 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3312 xfs_iflush_abort(ip);
3313 /*
3314 * Unlocks the flush lock
3315 */
3316 return XFS_ERROR(EFSCORRUPTED);
3317
3318cluster_corrupt_out:
3319 /* Corruption detected in the clustering loop. Invalidate the
3320 * inode buffer and shut down the filesystem.
3321 */
3322 mutex_spinunlock(&ch->ch_lock, s);
3323
3324 /*
3325 * Clean up the buffer. If it was B_DELWRI, just release it --
3326 * brelse can handle it with no problems. If not, shut down the
3327 * filesystem before releasing the buffer.
3328 */
3329	if ((bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp))) {
3330 xfs_buf_relse(bp);
3331 }
3332
3333 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3334
3335	if (!bufwasdelwri) {
3336 /*
3337 * Just like incore_relse: if we have b_iodone functions,
3338 * mark the buffer as an error and call them. Otherwise
3339 * mark it as stale and brelse.
3340 */
3341 if (XFS_BUF_IODONE_FUNC(bp)) {
3342 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3343 XFS_BUF_UNDONE(bp);
3344 XFS_BUF_STALE(bp);
3345 XFS_BUF_SHUT(bp);
3346			XFS_BUF_ERROR(bp, EIO);
3347 xfs_biodone(bp);
3348 } else {
3349 XFS_BUF_STALE(bp);
3350 xfs_buf_relse(bp);
3351 }
3352 }
3353
3354 xfs_iflush_abort(iq);
3355 /*
3356 * Unlocks the flush lock
3357 */
3358 return XFS_ERROR(EFSCORRUPTED);
3359}
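
For reference, the two flag-mapping switches inside xfs_iflush reduce to a small decision table: when the inode has logged dirty fields it is in the AIL, so the DELWRI_ELSE_* requests resolve to the non-delwri mode; otherwise they resolve to delwri. A sketch of that table as a pure function, with illustrative names:

enum req  { REQ_SYNC, REQ_ASYNC, REQ_DELWRI,
	    REQ_DELWRI_ELSE_SYNC, REQ_DELWRI_ELSE_ASYNC };
enum mode { MODE_SYNC, MODE_ASYNC, MODE_DELWRI };

/* inode_logged is true when ilf_fields != 0, i.e. the inode is in the
 * AIL; the "else" requests then take the non-delwri branch. */
static enum mode pick_write_mode(enum req req, int inode_logged)
{
	switch (req) {
	case REQ_SYNC:			return MODE_SYNC;
	case REQ_ASYNC:			return MODE_ASYNC;
	case REQ_DELWRI:		return MODE_DELWRI;
	case REQ_DELWRI_ELSE_SYNC:	return inode_logged ? MODE_SYNC
							    : MODE_DELWRI;
	case REQ_DELWRI_ELSE_ASYNC:	return inode_logged ? MODE_ASYNC
							    : MODE_DELWRI;
	}
	return MODE_SYNC;		/* unreachable for valid requests */
}
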
3360
3361
3362STATIC int
3363xfs_iflush_int(
3364 xfs_inode_t *ip,
3365 xfs_buf_t *bp)
3366{
3367 xfs_inode_log_item_t *iip;
3368 xfs_dinode_t *dip;
3369 xfs_mount_t *mp;
3370#ifdef XFS_TRANS_DEBUG
3371 int first;
3372#endif
3373 SPLDECL(s);
3374
3375 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3376 ASSERT(valusema(&ip->i_flock) <= 0);
3377 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3378 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3379
3380 iip = ip->i_itemp;
3381 mp = ip->i_mount;
3382
3383
3384 /*
3385 * If the inode isn't dirty, then just release the inode
3386 * flush lock and do nothing.
3387 */
3388 if ((ip->i_update_core == 0) &&
3389 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3390 xfs_ifunlock(ip);
3391 return 0;
3392 }
3393
3394	/* Point dip at the inode's location within the buffer. */
3395 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
3396
3397 /*
3398 * Clear i_update_core before copying out the data.
3399 * This is for coordination with our timestamp updates
3400 * that don't hold the inode lock. They will always
3401 * update the timestamps BEFORE setting i_update_core,
3402 * so if we clear i_update_core after they set it we
3403 * are guaranteed to see their updates to the timestamps.
3404 * I believe that this depends on strongly ordered memory
3405 * semantics, but we have that. We use the SYNCHRONIZE
3406 * macro to make sure that the compiler does not reorder
3407	 * the i_update_core store below the data copy that follows.
3408 */
3409 ip->i_update_core = 0;
3410 SYNCHRONIZE();
3411
3412 if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
3413 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3414 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3415 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3416 ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
3417 goto corrupt_out;
3418 }
3419 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3420 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3421 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3422 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3423 ip->i_ino, ip, ip->i_d.di_magic);
3424 goto corrupt_out;
3425 }
3426 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3427 if (XFS_TEST_ERROR(
3428 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3429 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3430 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3431 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3432 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
3433 ip->i_ino, ip);
3434 goto corrupt_out;
3435 }
3436 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
3437 if (XFS_TEST_ERROR(
3438 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3439 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3440 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3441 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3442 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3443 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
3444 ip->i_ino, ip);
3445 goto corrupt_out;
3446 }
3447 }
3448 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3449 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3450 XFS_RANDOM_IFLUSH_5)) {
3451 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3452 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
3453 ip->i_ino,
3454 ip->i_d.di_nextents + ip->i_d.di_anextents,
3455 ip->i_d.di_nblocks,
3456 ip);
3457 goto corrupt_out;
3458 }
3459 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3460 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3461 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3462 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3463 ip->i_ino, ip->i_d.di_forkoff, ip);
3464 goto corrupt_out;
3465 }
3466 /*
3467 * bump the flush iteration count, used to detect flushes which
3468 * postdate a log record during recovery.
3469 */
3470
3471 ip->i_d.di_flushiter++;
3472
3473 /*
3474 * Copy the dirty parts of the inode into the on-disk
3475 * inode. We always copy out the core of the inode,
3476 * because if the inode is dirty at all the core must
3477 * be.
3478 */
3479 xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1);
3480
3481	/* Wrap it; we never let the log put out DI_MAX_FLUSH */
3482 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3483 ip->i_d.di_flushiter = 0;
3484
3485 /*
3486 * If this is really an old format inode and the superblock version
3487 * has not been updated to support only new format inodes, then
3488 * convert back to the old inode format. If the superblock version
3489 * has been updated, then make the conversion permanent.
3490 */
3491 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
3492 XFS_SB_VERSION_HASNLINK(&mp->m_sb));
3493 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3494 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
3495 /*
3496 * Convert it back.
3497 */
3498 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3499 INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink);
3500 } else {
3501 /*
3502 * The superblock version has already been bumped,
3503 * so just make the conversion to the new inode
3504 * format permanent.
3505 */
3506 ip->i_d.di_version = XFS_DINODE_VERSION_2;
3507 INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2);
3508 ip->i_d.di_onlink = 0;
3509 dip->di_core.di_onlink = 0;
3510 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3511 memset(&(dip->di_core.di_pad[0]), 0,
3512 sizeof(dip->di_core.di_pad));
3513 ASSERT(ip->i_d.di_projid == 0);
3514 }
3515 }
3516
3517 if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
3518 goto corrupt_out;
3519 }
3520
3521 if (XFS_IFORK_Q(ip)) {
3522 /*
3523 * The only error from xfs_iflush_fork is on the data fork.
3524 */
3525 (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3526 }
3527 xfs_inobp_check(mp, bp);
3528
3529 /*
3530 * We've recorded everything logged in the inode, so we'd
3531 * like to clear the ilf_fields bits so we don't log and
3532 * flush things unnecessarily. However, we can't stop
3533 * logging all this information until the data we've copied
3534 * into the disk buffer is written to disk. If we did we might
3535 * overwrite the copy of the inode in the log with all the
3536 * data after re-logging only part of it, and in the face of
3537 * a crash we wouldn't have all the data we need to recover.
3538 *
3539 * What we do is move the bits to the ili_last_fields field.
3540 * When logging the inode, these bits are moved back to the
3541 * ilf_fields field. In the xfs_iflush_done() routine we
3542 * clear ili_last_fields, since we know that the information
3543 * those bits represent is permanently on disk. As long as
3544 * the flush completes before the inode is logged again, then
3545 * both ilf_fields and ili_last_fields will be cleared.
3546 *
3547 * We can play with the ilf_fields bits here, because the inode
3548 * lock must be held exclusively in order to set bits there
3549 * and the flush lock protects the ili_last_fields bits.
3550 * Set ili_logged so the flush done
3551 * routine can tell whether or not to look in the AIL.
3552 * Also, store the current LSN of the inode so that we can tell
3553 * whether the item has moved in the AIL from xfs_iflush_done().
3554 * In order to read the lsn we need the AIL lock, because
3555 * it is a 64 bit value that cannot be read atomically.
3556 */
3557 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3558 iip->ili_last_fields = iip->ili_format.ilf_fields;
3559 iip->ili_format.ilf_fields = 0;
3560 iip->ili_logged = 1;
3561
3562 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
3563 AIL_LOCK(mp,s);
3564 iip->ili_flush_lsn = iip->ili_item.li_lsn;
3565 AIL_UNLOCK(mp, s);
3566
3567 /*
3568 * Attach the function xfs_iflush_done to the inode's
3569 * buffer. This will remove the inode from the AIL
3570 * and unlock the inode's flush lock when the inode is
3571 * completely written to disk.
3572 */
3573 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
3574 xfs_iflush_done, (xfs_log_item_t *)iip);
3575
3576 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3577 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
3578 } else {
3579 /*
3580 * We're flushing an inode which is not in the AIL and has
3581 * not been logged but has i_update_core set. For this
3582 * case we can use a B_DELWRI flush and immediately drop
3583 * the inode flush lock because we can avoid the whole
3584 * AIL state thing. It's OK to drop the flush lock now,
3585 * because we've already locked the buffer and to do anything
3586 * you really need both.
3587 */
3588 if (iip != NULL) {
3589 ASSERT(iip->ili_logged == 0);
3590 ASSERT(iip->ili_last_fields == 0);
3591 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
3592 }
3593 xfs_ifunlock(ip);
3594 }
3595
3596 return 0;
3597
3598corrupt_out:
3599 return XFS_ERROR(EFSCORRUPTED);
3600}
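
The long comment inside xfs_iflush_int describes a two-set handoff for the dirty-field bits. Stripped of the locking, the protocol looks roughly like this (illustrative names; the real code keeps these in ilf_fields and ili_last_fields under the inode and flush locks):

struct log_state {
	unsigned int	fields;		/* dirtied since the last flush started */
	unsigned int	last_fields;	/* bits covered by the in-flight buffer */
};

/* At flush time: hand the logged bits over to the in-flight I/O. */
static void flush_submit(struct log_state *s)
{
	s->last_fields = s->fields;
	s->fields = 0;
}

/* If the inode is logged again before the I/O completes, the in-flight
 * bits are logged again too, so a crash never loses fields. */
static void log_again(struct log_state *s, unsigned int newbits)
{
	s->fields |= s->last_fields | newbits;
}

/* At I/O completion: the handed-over bits are permanently on disk. */
static void flush_done(struct log_state *s)
{
	s->last_fields = 0;
}
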
3601
3602
3603/*
3604 * Flush all inactive inodes in mp. Return true if no user references
3605 * were found, false otherwise.
3606 */
3607int
3608xfs_iflush_all(
3609 xfs_mount_t *mp,
3610 int flag)
3611{
3612 int busy;
3613 int done;
3614 int purged;
3615 xfs_inode_t *ip;
3616 vmap_t vmap;
3617 vnode_t *vp;
3618
3619 busy = done = 0;
3620 while (!done) {
3621 purged = 0;
3622 XFS_MOUNT_ILOCK(mp);
3623 ip = mp->m_inodes;
3624 if (ip == NULL) {
3625 break;
3626 }
3627 do {
3628 /* Make sure we skip markers inserted by sync */
3629 if (ip->i_mount == NULL) {
3630 ip = ip->i_mnext;
3631 continue;
3632 }
3633
3634 /*
3635 * It's up to our caller to purge the root
3636 * and quota vnodes later.
3637 */
3638 vp = XFS_ITOV_NULL(ip);
3639
3640 if (!vp) {
3641 XFS_MOUNT_IUNLOCK(mp);
3642 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3643 purged = 1;
3644 break;
3645 }
3646
3647 if (vn_count(vp) != 0) {
3648 if (vn_count(vp) == 1 &&
3649 (ip == mp->m_rootip ||
3650 (mp->m_quotainfo &&
3651 (ip->i_ino == mp->m_sb.sb_uquotino ||
3652 ip->i_ino == mp->m_sb.sb_gquotino)))) {
3653
3654 ip = ip->i_mnext;
3655 continue;
3656 }
3657 if (!(flag & XFS_FLUSH_ALL)) {
3658 busy = 1;
3659 done = 1;
3660 break;
3661 }
3662 /*
3663 * Ignore busy inodes but continue flushing
3664 * others.
3665 */
3666 ip = ip->i_mnext;
3667 continue;
3668 }
3669 /*
3670 * Sample vp mapping while holding mp locked on MP
3671 * systems, so we don't purge a reclaimed or
3672			 * nonexistent vnode.  We break from the loop
3673			 * since we know that we modify the list by
3674			 * pulling ourselves from it in xfs_reclaim(),
3675			 * called via vn_purge() below.  The purged flag
3676			 * set below records that we left the loop via a
3677			 * purge rather than by reaching the end of the list.
3678 */
3679 VMAP(vp, vmap);
3680 XFS_MOUNT_IUNLOCK(mp);
3681
3682 vn_purge(vp, &vmap);
3683
3684 purged = 1;
3685 break;
3686 } while (ip != mp->m_inodes);
3687 /*
3688 * We need to distinguish between when we exit the loop
3689 * after a purge and when we simply hit the end of the
3690 * list. We can't use the (ip == mp->m_inodes) test,
3691 * because when we purge an inode at the start of the list
3692 * the next inode on the list becomes mp->m_inodes. That
3693 * would cause such a test to bail out early. The purged
3694 * variable tells us how we got out of the loop.
3695 */
3696 if (!purged) {
3697 done = 1;
3698 }
3699 }
3700 XFS_MOUNT_IUNLOCK(mp);
3701 return !busy;
3702}
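
The comment above explains the purged flag: purging can change the list head, so a restart after a purge must be distinguished from a completed pass. A standalone sketch of that restartable-scan pattern on a plain singly linked list (illustrative types; unlinked nodes are not freed here):

struct node { struct node *next; int busy; };

/* Unlink n from the list; may change *headp itself. */
static void unlink_node(struct node **headp, struct node *n)
{
	for (struct node **pp = headp; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;
			return;
		}
	}
}

/* Returns nonzero if no busy entries were seen. */
int purge_all_idle(struct node **headp)
{
	int done = 0, busy = 0;

	while (!done) {
		int purged = 0;

		for (struct node *n = *headp; n != NULL; n = n->next) {
			if (n->busy) {
				busy = 1;
				continue;	/* skip busy entries */
			}
			unlink_node(headp, n);	/* invalidates our cursor... */
			purged = 1;
			break;			/* ...so restart the scan */
		}
		if (!purged)
			done = 1;	/* a full pass purged nothing */
	}
	return !busy;
}
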
3703
3704
3705/*
3706 * xfs_iaccess: check accessibility of inode for mode.
3707 */
3708int
3709xfs_iaccess(
3710 xfs_inode_t *ip,
3711 mode_t mode,
3712 cred_t *cr)
3713{
3714 int error;
3715 mode_t orgmode = mode;
3716 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
3717
3718 if (mode & S_IWUSR) {
3719 umode_t imode = inode->i_mode;
3720
3721 if (IS_RDONLY(inode) &&
3722 (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
3723 return XFS_ERROR(EROFS);
3724
3725 if (IS_IMMUTABLE(inode))
3726 return XFS_ERROR(EACCES);
3727 }
3728
3729 /*
3730 * If there's an Access Control List it's used instead of
3731 * the mode bits.
3732 */
3733 if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
3734 return error ? XFS_ERROR(error) : 0;
3735
3736 if (current_fsuid(cr) != ip->i_d.di_uid) {
3737 mode >>= 3;
3738 if (!in_group_p((gid_t)ip->i_d.di_gid))
3739 mode >>= 3;
3740 }
3741
3742 /*
3743 * If the DACs are ok we don't need any capability check.
3744 */
3745 if ((ip->i_d.di_mode & mode) == mode)
3746 return 0;
3747 /*
3748 * Read/write DACs are always overridable.
3749 * Executable DACs are overridable if at least one exec bit is set.
3750 */
3751 if (!(orgmode & S_IXUSR) ||
3752 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3753 if (capable_cred(cr, CAP_DAC_OVERRIDE))
3754 return 0;
3755
3756 if ((orgmode == S_IRUSR) ||
3757 (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) {
3758 if (capable_cred(cr, CAP_DAC_READ_SEARCH))
3759 return 0;
3760#ifdef NOISE
3761 cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode);
3762#endif /* NOISE */
3763 return XFS_ERROR(EACCES);
3764 }
3765 return XFS_ERROR(EACCES);
3766}
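
The two mode >>= 3 shifts in xfs_iaccess walk from the owner permission bits to the group bits and then to the "other" bits, since each class occupies three adjacent bits of the mode word. The classic check in self-contained form (simplified: no ACLs, capability overrides, or supplementary groups):

#include <sys/stat.h>

/* Returns nonzero if (uid, gid) may access a file whose mode bits are
 * file_mode, for the permissions requested in want (expressed in the
 * owner-class S_I?USR bits, as in xfs_iaccess). */
int unix_access_ok(mode_t file_mode, unsigned int file_uid,
		   unsigned int file_gid, unsigned int uid,
		   unsigned int gid, mode_t want)
{
	if (uid != file_uid) {
		want >>= 3;		/* not the owner: use the group bits */
		if (gid != file_gid)
			want >>= 3;	/* nor the group: use the other bits */
	}
	return (file_mode & want) == want;
}
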
3767
3768/*
3769 * xfs_iroundup: round up argument to next power of two
3770 */
3771uint
3772xfs_iroundup(
3773 uint v)
3774{
3775 int i;
3776 uint m;
3777
3778 if ((v & (v - 1)) == 0)
3779 return v;
3780 ASSERT((v & 0x80000000) == 0);
3781 if ((v & (v + 1)) == 0)
3782 return v + 1;
3783 for (i = 0, m = 1; i < 31; i++, m <<= 1) {
3784 if (v & m)
3785 continue;
3786 v |= m;
3787 if ((v & (v + 1)) == 0)
3788 return v + 1;
3789 }
3790 ASSERT(0);
3791	return 0;
3792}
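
xfs_iroundup probes bit positions one at a time; the usual branch-free alternative smears the highest set bit downward and adds one. Both agree for 0 < v < 2^31, returning v itself when v is already a power of two:

#include <stdint.h>

/* Round v up to the next power of two by smearing the top set bit
 * downward; exact powers of two map to themselves.  Valid for
 * 0 < v < 2^31 (v == 0 wraps around to 0). */
uint32_t roundup_pow2(uint32_t v)
{
	v--;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;	/* all bits below the highest set bit are now 1 */
	return v + 1;
}

/* e.g. roundup_pow2(5) == 8, roundup_pow2(8) == 8, roundup_pow2(33) == 64 */
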
3793
3794/*
3795 * Change the requested timestamp in the given inode.
3796 * We don't lock across timestamp updates, and we don't log them, but
3797 * we do record the fact that there is dirty information in core.
3798 *
3799 * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
3800 * with XFS_ICHGTIME_ACC to be sure that access time
3801 * update will take. Calling first with XFS_ICHGTIME_ACC
3802 * and then XFS_ICHGTIME_MOD may fail to modify the access
3803 *		timestamp if the filesystem is mounted "noatime".
3804 */
3805void
3806xfs_ichgtime(xfs_inode_t *ip,
3807 int flags)
3808{
3809 timespec_t tv;
3810 vnode_t *vp = XFS_ITOV(ip);
3811 struct inode *inode = LINVFS_GET_IP(vp);
3812
3813 /*
3814 * We're not supposed to change timestamps in readonly-mounted
3815 * filesystems. Throw it away if anyone asks us.
3816 */
3817 if (unlikely(vp->v_vfsp->vfs_flag & VFS_RDONLY))
3818 return;
3819
3820 /*
3821 * Don't update access timestamps on reads if mounted "noatime"
3822	 * Don't update access timestamps on reads if mounted "noatime".
3823 */
3824 if ((ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) &&
3825 ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG))
3826 == XFS_ICHGTIME_ACC))
3827 return;
3828
3829 nanotime(&tv);
3830 if (flags & XFS_ICHGTIME_MOD) {
3831 VN_MTIMESET(vp, &tv);
3832 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
3833 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
3834 }
3835 if (flags & XFS_ICHGTIME_ACC) {
3836 VN_ATIMESET(vp, &tv);
3837 ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
3838 ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
3839 }
3840 if (flags & XFS_ICHGTIME_CHG) {
3841 VN_CTIMESET(vp, &tv);
3842 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
3843 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
3844 }
3845
3846 /*
3847 * We update the i_update_core field _after_ changing
3848 * the timestamps in order to coordinate properly with
3849 * xfs_iflush() so that we don't lose timestamp updates.
3850 * This keeps us from having to hold the inode lock
3851 * while doing this. We use the SYNCHRONIZE macro to
3852 * ensure that the compiler does not reorder the update
3853	 * of i_update_core ahead of the timestamp updates above.
3854 */
3855 SYNCHRONIZE();
3856 ip->i_update_core = 1;
3857 if (!(inode->i_state & I_LOCK))
3858 mark_inode_dirty_sync(inode);
3859}
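
A sketch of the publish/consume ordering that SYNCHRONIZE enforces between xfs_ichgtime and the flush path, written with C11 atomics (the plain timespec copy is a simplification; a real implementation would need its own protection against torn reads):

#include <stdatomic.h>
#include <time.h>

struct tstate {
	struct timespec	ts;		/* the timestamp payload */
	atomic_int	update_core;	/* "payload is dirty" flag */
};

/* Updater (cf. xfs_ichgtime): write the data, then publish the flag. */
void change_time(struct tstate *s)
{
	timespec_get(&s->ts, TIME_UTC);
	atomic_thread_fence(memory_order_release);	/* cf. SYNCHRONIZE */
	atomic_store_explicit(&s->update_core, 1, memory_order_relaxed);
}

/* Flusher (cf. xfs_iflush_int): clear the flag, then read the data.
 * Any timestamp written after the clear re-sets the flag, so it is
 * picked up by the next flush rather than lost. */
void flush_timestamps(struct tstate *s, struct timespec *out)
{
	atomic_store_explicit(&s->update_core, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* cf. SYNCHRONIZE */
	*out = s->ts;
}
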
3860
3861#ifdef XFS_ILOCK_TRACE
3862ktrace_t *xfs_ilock_trace_buf;
3863
3864void
3865xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3866{
3867 ktrace_enter(ip->i_lock_trace,
3868 (void *)ip,
3869		     (void *)(unsigned long)lock, /* 1 = LOCK, 3 = UNLOCK, etc. */
3870 (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
3871 (void *)ra, /* caller of ilock */
3872 (void *)(unsigned long)current_cpu(),
3873 (void *)(unsigned long)current_pid(),
3874 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
3875}
3876#endif
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
new file mode 100644
index 000000000000..a53b1ccf6070
--- /dev/null
+++ b/fs/xfs/xfs_inode.h
@@ -0,0 +1,554 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_INODE_H__
33#define __XFS_INODE_H__
34
35/*
36 * File incore extent information, present for each of data & attr forks.
37 */
38#define XFS_INLINE_EXTS 2
39#define XFS_INLINE_DATA 32
40typedef struct xfs_ifork {
41 int if_bytes; /* bytes in if_u1 */
42 int if_real_bytes; /* bytes allocated in if_u1 */
43 xfs_bmbt_block_t *if_broot; /* file's incore btree root */
44 short if_broot_bytes; /* bytes allocated for root */
45 unsigned char if_flags; /* per-fork flags */
46 unsigned char if_ext_max; /* max # of extent records */
47 xfs_extnum_t if_lastex; /* last if_extents used */
48 union {
49 xfs_bmbt_rec_t *if_extents; /* linear map file exts */
50 char *if_data; /* inline file data */
51 } if_u1;
52 union {
53 xfs_bmbt_rec_t if_inline_ext[XFS_INLINE_EXTS];
54 /* very small file extents */
55 char if_inline_data[XFS_INLINE_DATA];
56 /* very small file data */
57 xfs_dev_t if_rdev; /* dev number if special */
58 uuid_t if_uuid; /* mount point value */
59 } if_u2;
60} xfs_ifork_t;
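
The if_u1/if_u2 unions implement a common small-object scheme: a tiny payload lives inline in the structure, a larger one lives in separately allocated memory. A minimal sketch of the idea, with illustrative names (assumes a zero-initialized struct and that src does not alias the fork's current data):

#include <stdlib.h>
#include <string.h>

#define INLINE_BYTES 32			/* cf. XFS_INLINE_DATA */

struct fork {
	int	bytes;			/* bytes currently stored */
	char	*data;			/* points at heap or at inline_data */
	char	inline_data[INLINE_BYTES];
};

/* Store n bytes in the fork: inline when small, heap when large.
 * Returns 0 on success, -1 on allocation failure. */
int fork_set(struct fork *f, const char *src, int n)
{
	char *dst = (n <= INLINE_BYTES) ? f->inline_data : malloc(n);

	if (dst == NULL)
		return -1;
	memcpy(dst, src, n);
	if (f->data != NULL && f->data != f->inline_data)
		free(f->data);		/* drop any previous heap copy */
	f->data = dst;
	f->bytes = n;
	return 0;
}
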
61
62/*
63 * Flags for xfs_ichgtime().
64 */
65#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
66#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */
67#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */
68
69/*
70 * Per-fork incore inode flags.
71 */
72#define XFS_IFINLINE 0x0001 /* Inline data is read in */
73#define XFS_IFEXTENTS 0x0002 /* All extent pointers are read in */
74#define XFS_IFBROOT 0x0004 /* i_broot points to the bmap b-tree root */
75
76/*
77 * Flags for xfs_imap() and xfs_dilocate().
78 */
79#define XFS_IMAP_LOOKUP 0x1
80
81/*
82 * Maximum number of extent pointers in if_u1.if_extents.
83 */
84#define XFS_MAX_INCORE_EXTENTS 32768
85
86
87#ifdef __KERNEL__
88struct bhv_desc;
89struct cred;
90struct ktrace;
91struct vnode;
92struct xfs_buf;
93struct xfs_bmap_free;
94struct xfs_bmbt_irec;
95struct xfs_bmbt_block;
96struct xfs_inode;
97struct xfs_inode_log_item;
98struct xfs_mount;
99struct xfs_trans;
100struct xfs_dquot;
101
102#if defined(XFS_ILOCK_TRACE)
103#define XFS_ILOCK_KTRACE_SIZE 32
104extern ktrace_t *xfs_ilock_trace_buf;
105extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
106#else
107#define xfs_ilock_trace(i,n,f,ra)
108#endif
109
110/*
111 * This structure is used to communicate, from xfs_write_file() to
112 * xfs_strat_read(), which extents of a file were holes when a write
113 * started.  This is necessary so that we can know which blocks need
114 * to be zeroed when they are read in by xfs_strat_read() if they
115 * weren't allocated when the buffer given to xfs_strat_read() was
116 * mapped.
117 *
118 * We keep a list of these attached to the inode. The list is
119 * protected by the inode lock and the fact that the io lock is
120 * held exclusively by writers.
121 */
122typedef struct xfs_gap {
123 struct xfs_gap *xg_next;
124 xfs_fileoff_t xg_offset_fsb;
125 xfs_extlen_t xg_count_fsb;
126} xfs_gap_t;
127
128typedef struct dm_attrs_s {
129 __uint32_t da_dmevmask; /* DMIG event mask */
130 __uint16_t da_dmstate; /* DMIG state info */
131 __uint16_t da_pad; /* DMIG extra padding */
132} dm_attrs_t;
133
134typedef struct xfs_iocore {
135 void *io_obj; /* pointer to container
136 * inode or dcxvn structure */
137 struct xfs_mount *io_mount; /* fs mount struct ptr */
138#ifdef DEBUG
139	mrlock_t	*io_lock;	/* inode lock */
140 mrlock_t *io_iolock; /* inode IO lock */
141#endif
142
143 /* I/O state */
144 xfs_fsize_t io_new_size; /* sz when write completes */
145
146 /* Miscellaneous state. */
147 unsigned int io_flags; /* IO related flags */
148
149 /* DMAPI state */
150 dm_attrs_t io_dmattrs;
151
152} xfs_iocore_t;
153
154#define io_dmevmask io_dmattrs.da_dmevmask
155#define io_dmstate io_dmattrs.da_dmstate
156
157#define XFS_IO_INODE(io) ((xfs_inode_t *) ((io)->io_obj))
158#define XFS_IO_DCXVN(io) ((dcxvn_t *) ((io)->io_obj))
159
160/*
161 * Flags for the io_flags field above.
162 */
163
164#define XFS_IOCORE_RT 0x1
165
166/*
167 * xfs_iocore prototypes
168 */
169
170extern void xfs_iocore_inode_init(struct xfs_inode *);
171extern void xfs_iocore_inode_reinit(struct xfs_inode *);
172
173
174/*
175 * This is the type used in the xfs inode hash table.
176 * An array of these is allocated for each mounted
177 * file system to hash the inodes for that file system.
178 */
179typedef struct xfs_ihash {
180 struct xfs_inode *ih_next;
181 rwlock_t ih_lock;
182 uint ih_version;
183} xfs_ihash_t;
184
185#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize))
186
187/*
188 * This is the xfs inode cluster hash. This hash is used by xfs_iflush to
189 * find inodes that share a cluster and can be flushed to disk at the same
190 * time.
191 */
192typedef struct xfs_chashlist {
193 struct xfs_chashlist *chl_next;
194 struct xfs_inode *chl_ip;
195 xfs_daddr_t chl_blkno; /* starting block number of
196 * the cluster */
197 struct xfs_buf *chl_buf; /* the inode buffer */
198} xfs_chashlist_t;
199
200typedef struct xfs_chash {
201 xfs_chashlist_t *ch_list;
202 lock_t ch_lock;
203} xfs_chash_t;
204
205#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
206
207
208/*
209 * This is the xfs in-core inode structure.
210 * Most of the on-disk inode is embedded in the i_d field.
211 *
212 * The extent pointers/inline file space, however, are managed
213 * separately. The memory for this information is pointed to by
214 * the if_u1 union, depending on the type of the data.
215 * This linearizes the array of extents for fast in-core
216 * access, and is used until the file's number of extents
217 * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
218 * are accessed through the buffer cache.
219 *
220 * Other state kept in the in-core inode is used for identification,
221 * locking, transactional updating, etc of the inode.
222 *
223 * Generally, we do not want to hold the i_rlock while holding the
224 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
225 *
226 * xfs_iptr_t contains all the inode fields up to and including the
227 * i_mnext and i_mprev fields; it is used as a marker in the inode
228 * chain off the mount structure by xfs_sync calls.
229 */
230
231typedef struct {
232 struct xfs_ihash *ip_hash; /* pointer to hash header */
233 struct xfs_inode *ip_next; /* inode hash link forw */
234 struct xfs_inode *ip_mnext; /* next inode in mount list */
235 struct xfs_inode *ip_mprev; /* ptr to prev inode */
236 struct xfs_inode **ip_prevp; /* ptr to prev i_next */
237 struct xfs_mount *ip_mount; /* fs mount struct ptr */
238} xfs_iptr_t;
239
240typedef struct xfs_inode {
241 /* Inode linking and identification information. */
242 struct xfs_ihash *i_hash; /* pointer to hash header */
243 struct xfs_inode *i_next; /* inode hash link forw */
244 struct xfs_inode *i_mnext; /* next inode in mount list */
245 struct xfs_inode *i_mprev; /* ptr to prev inode */
246 struct xfs_inode **i_prevp; /* ptr to prev i_next */
247 struct xfs_mount *i_mount; /* fs mount struct ptr */
248 struct list_head i_reclaim; /* reclaim list */
249 struct bhv_desc i_bhv_desc; /* inode behavior descriptor*/
250 struct xfs_dquot *i_udquot; /* user dquot */
251 struct xfs_dquot *i_gdquot; /* group dquot */
252
253 /* Inode location stuff */
254 xfs_ino_t i_ino; /* inode number (agno/agino)*/
255 xfs_daddr_t i_blkno; /* blkno of inode buffer */
256 ushort i_len; /* len of inode buffer */
257 ushort i_boffset; /* off of inode in buffer */
258
259 /* Extent information. */
260 xfs_ifork_t *i_afp; /* attribute fork pointer */
261 xfs_ifork_t i_df; /* data fork */
262
263 /* Transaction and locking information. */
264 struct xfs_trans *i_transp; /* ptr to owning transaction*/
265 struct xfs_inode_log_item *i_itemp; /* logging information */
266 mrlock_t i_lock; /* inode lock */
267 mrlock_t i_iolock; /* inode IO lock */
268 sema_t i_flock; /* inode flush lock */
269 atomic_t i_pincount; /* inode pin count */
270 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
271#ifdef HAVE_REFCACHE
272 struct xfs_inode **i_refcache; /* ptr to entry in ref cache */
273 struct xfs_inode *i_release; /* inode to unref */
274#endif
275 /* I/O state */
276 xfs_iocore_t i_iocore; /* I/O core */
277
278 /* Miscellaneous state. */
279 unsigned short i_flags; /* see defined flags below */
280 unsigned char i_update_core; /* timestamps/size is dirty */
281 unsigned char i_update_size; /* di_size field is dirty */
282 unsigned int i_gen; /* generation count */
283 unsigned int i_delayed_blks; /* count of delay alloc blks */
284
285 xfs_dinode_core_t i_d; /* most of ondisk inode */
286 xfs_chashlist_t *i_chash; /* cluster hash list header */
287 struct xfs_inode *i_cnext; /* cluster hash link forward */
288 struct xfs_inode *i_cprev; /* cluster hash link backward */
289
290 /* Trace buffers per inode. */
291#ifdef XFS_BMAP_TRACE
292 struct ktrace *i_xtrace; /* inode extent list trace */
293#endif
294#ifdef XFS_BMBT_TRACE
295 struct ktrace *i_btrace; /* inode bmap btree trace */
296#endif
297#ifdef XFS_RW_TRACE
298 struct ktrace *i_rwtrace; /* inode read/write trace */
299#endif
300#ifdef XFS_ILOCK_TRACE
301 struct ktrace *i_lock_trace; /* inode lock/unlock trace */
302#endif
303#ifdef XFS_DIR2_TRACE
304 struct ktrace *i_dir_trace; /* inode directory trace */
305#endif
306} xfs_inode_t;
307
308#endif /* __KERNEL__ */
309
310
311/*
312 * Fork handling.
313 */
314#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_PTR)
315xfs_ifork_t *xfs_ifork_ptr(xfs_inode_t *ip, int w);
316#define XFS_IFORK_PTR(ip,w) xfs_ifork_ptr(ip,w)
317#else
318#define XFS_IFORK_PTR(ip,w) ((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp)
319#endif
320#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_Q)
321int xfs_ifork_q(xfs_inode_t *ip);
322#define XFS_IFORK_Q(ip) xfs_ifork_q(ip)
323#else
324#define XFS_IFORK_Q(ip) XFS_CFORK_Q(&(ip)->i_d)
325#endif
326#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_DSIZE)
327int xfs_ifork_dsize(xfs_inode_t *ip);
328#define XFS_IFORK_DSIZE(ip) xfs_ifork_dsize(ip)
329#else
330#define XFS_IFORK_DSIZE(ip) XFS_CFORK_DSIZE(&ip->i_d, ip->i_mount)
331#endif
332#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_ASIZE)
333int xfs_ifork_asize(xfs_inode_t *ip);
334#define XFS_IFORK_ASIZE(ip) xfs_ifork_asize(ip)
335#else
336#define XFS_IFORK_ASIZE(ip) XFS_CFORK_ASIZE(&ip->i_d, ip->i_mount)
337#endif
338#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_SIZE)
339int xfs_ifork_size(xfs_inode_t *ip, int w);
340#define XFS_IFORK_SIZE(ip,w) xfs_ifork_size(ip,w)
341#else
342#define XFS_IFORK_SIZE(ip,w) XFS_CFORK_SIZE(&ip->i_d, ip->i_mount, w)
343#endif
344#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FORMAT)
345int xfs_ifork_format(xfs_inode_t *ip, int w);
346#define XFS_IFORK_FORMAT(ip,w) xfs_ifork_format(ip,w)
347#else
348#define XFS_IFORK_FORMAT(ip,w) XFS_CFORK_FORMAT(&ip->i_d, w)
349#endif
350#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FMT_SET)
351void xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n);
352#define XFS_IFORK_FMT_SET(ip,w,n) xfs_ifork_fmt_set(ip,w,n)
353#else
354#define XFS_IFORK_FMT_SET(ip,w,n) XFS_CFORK_FMT_SET(&ip->i_d, w, n)
355#endif
356#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXTENTS)
357int xfs_ifork_nextents(xfs_inode_t *ip, int w);
358#define XFS_IFORK_NEXTENTS(ip,w) xfs_ifork_nextents(ip,w)
359#else
360#define XFS_IFORK_NEXTENTS(ip,w) XFS_CFORK_NEXTENTS(&ip->i_d, w)
361#endif
362#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXT_SET)
363void xfs_ifork_next_set(xfs_inode_t *ip, int w, int n);
364#define XFS_IFORK_NEXT_SET(ip,w,n) xfs_ifork_next_set(ip,w,n)
365#else
366#define XFS_IFORK_NEXT_SET(ip,w,n) XFS_CFORK_NEXT_SET(&ip->i_d, w, n)
367#endif
368
369
370#ifdef __KERNEL__
371
372/*
373 * In-core inode flags.
374 */
375#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */
376#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */
377#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */
378#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */
379#define XFS_ISTALE 0x0010 /* inode has been staled */
380#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */
381#define XFS_INEW 0x0040
382
383/*
384 * Flags for inode locking.
385 */
386#define XFS_IOLOCK_EXCL 0x001
387#define XFS_IOLOCK_SHARED 0x002
388#define XFS_ILOCK_EXCL 0x004
389#define XFS_ILOCK_SHARED 0x008
390#define XFS_IUNLOCK_NONOTIFY 0x010
391#define XFS_EXTENT_TOKEN_RD 0x040
392#define XFS_SIZE_TOKEN_RD 0x080
393#define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD)
394#define XFS_WILLLEND 0x100 /* Always acquire tokens for lending */
395#define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND)
396#define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND)
397#define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND)
398
399
400#define XFS_LOCK_MASK \
401 (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \
402 XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \
403 XFS_WILLLEND)
404
405/*
406 * Flags for xfs_iflush()
407 */
408#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
409#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
410#define XFS_IFLUSH_SYNC 3
411#define XFS_IFLUSH_ASYNC 4
412#define XFS_IFLUSH_DELWRI 5
413
414/*
415 * Flags for xfs_iflush_all.
416 */
417#define XFS_FLUSH_ALL 0x1
418
419/*
420 * Flags for xfs_itruncate_start().
421 */
422#define XFS_ITRUNC_DEFINITE 0x1
423#define XFS_ITRUNC_MAYBE 0x2
424
425#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOV)
426struct vnode *xfs_itov(xfs_inode_t *ip);
427#define XFS_ITOV(ip) xfs_itov(ip)
428#else
429#define XFS_ITOV(ip) BHV_TO_VNODE(XFS_ITOBHV(ip))
430#endif
431#define XFS_ITOV_NULL(ip) BHV_TO_VNODE_NULL(XFS_ITOBHV(ip))
432#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOBHV)
433struct bhv_desc *xfs_itobhv(xfs_inode_t *ip);
434#define XFS_ITOBHV(ip) xfs_itobhv(ip)
435#else
436#define XFS_ITOBHV(ip) ((struct bhv_desc *)(&((ip)->i_bhv_desc)))
437#endif
438#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOI)
439xfs_inode_t *xfs_bhvtoi(struct bhv_desc *bhvp);
440#define XFS_BHVTOI(bhvp) xfs_bhvtoi(bhvp)
441#else
442#define XFS_BHVTOI(bhvp) \
443 ((xfs_inode_t *)((char *)(bhvp) - \
444 (char *)&(((xfs_inode_t *)0)->i_bhv_desc)))
445#endif
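
The XFS_BHVTOI arithmetic above is the classic container-of trick: subtract the offset of the embedded member from the member's address to recover the enclosing structure. The same thing written with the standard offsetof macro, on illustrative types:

#include <stddef.h>

struct member { int m; };
struct outer  { int a; struct member inner; int b; };

/* Recover the enclosing struct outer from a pointer to its embedded
 * struct member, exactly as XFS_BHVTOI does with i_bhv_desc. */
#define outer_from_inner(p) \
	((struct outer *)((char *)(p) - offsetof(struct outer, inner)))

/* Usage: given struct outer o, outer_from_inner(&o.inner) == &o. */
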
446
447#define BHV_IS_XFS(bdp) (BHV_OPS(bdp) == &xfs_vnodeops)
448
449/*
450 * For multiple groups support: if S_ISGID bit is set in the parent
451 * directory, group of new file is set to that of the parent, and
452 * new subdirectory gets S_ISGID bit from parent.
453 */
454#define XFS_INHERIT_GID(pip, vfsp) \
455 (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID))
456
457/*
458 * xfs_iget.c prototypes.
459 */
460
461#define IGET_CREATE 1
462
463void xfs_ihash_init(struct xfs_mount *);
464void xfs_ihash_free(struct xfs_mount *);
465void xfs_chash_init(struct xfs_mount *);
466void xfs_chash_free(struct xfs_mount *);
467xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
468 struct xfs_trans *);
469void xfs_inode_lock_init(xfs_inode_t *, struct vnode *);
470int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
471 uint, uint, xfs_inode_t **, xfs_daddr_t);
472void xfs_iput(xfs_inode_t *, uint);
473void xfs_iput_new(xfs_inode_t *, uint);
474void xfs_ilock(xfs_inode_t *, uint);
475int xfs_ilock_nowait(xfs_inode_t *, uint);
476void xfs_iunlock(xfs_inode_t *, uint);
477void xfs_ilock_demote(xfs_inode_t *, uint);
478void xfs_iflock(xfs_inode_t *);
479int xfs_iflock_nowait(xfs_inode_t *);
480uint xfs_ilock_map_shared(xfs_inode_t *);
481void xfs_iunlock_map_shared(xfs_inode_t *, uint);
482void xfs_ifunlock(xfs_inode_t *);
483void xfs_ireclaim(xfs_inode_t *);
484int xfs_finish_reclaim(xfs_inode_t *, int, int);
485int xfs_finish_reclaim_all(struct xfs_mount *, int);
486
487/*
488 * xfs_inode.c prototypes.
489 */
490int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
491 xfs_dinode_t **, struct xfs_buf **, int *);
492int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
493 xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **,
494 xfs_daddr_t);
495int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
496 xfs_inode_t **, xfs_daddr_t);
497int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
498int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, nlink_t,
499 xfs_dev_t, struct cred *, xfs_prid_t, int,
500 struct xfs_buf **, boolean_t *, xfs_inode_t **);
501void xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *,
502 int);
503uint xfs_ip2xflags(struct xfs_inode *);
504uint xfs_dic2xflags(struct xfs_dinode_core *);
505int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
506 struct xfs_bmap_free *);
507void xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
508int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
509 xfs_fsize_t, int, int);
510int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
511int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *);
512void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *,
513 xfs_fsize_t, int);
514
515void xfs_idestroy_fork(xfs_inode_t *, int);
516void xfs_idestroy(xfs_inode_t *);
517void xfs_idata_realloc(xfs_inode_t *, int, int);
518void xfs_iextract(xfs_inode_t *);
519void xfs_iext_realloc(xfs_inode_t *, int, int);
520void xfs_iroot_realloc(xfs_inode_t *, int, int);
521void xfs_ipin(xfs_inode_t *);
522void xfs_iunpin(xfs_inode_t *);
523int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
524int xfs_iflush(xfs_inode_t *, uint);
525int xfs_iflush_all(struct xfs_mount *, int);
526int xfs_iaccess(xfs_inode_t *, mode_t, cred_t *);
527uint xfs_iroundup(uint);
528void xfs_ichgtime(xfs_inode_t *, int);
529xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
530void xfs_lock_inodes(xfs_inode_t **, int, int, uint);
531
532#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
533
534#ifdef DEBUG
535void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t);
536#else /* DEBUG */
537#define xfs_isize_check(mp, ip, isize)
538#endif /* DEBUG */
539
540#if defined(DEBUG)
541void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
542#else
543#define xfs_inobp_check(mp, bp)
544#endif /* DEBUG */
545
546extern struct kmem_zone *xfs_chashlist_zone;
547extern struct kmem_zone *xfs_ifork_zone;
548extern struct kmem_zone *xfs_inode_zone;
549extern struct kmem_zone *xfs_ili_zone;
550extern struct vnodeops xfs_vnodeops;
551
552#endif /* __KERNEL__ */
553
554#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
new file mode 100644
index 000000000000..768cb1816b8e
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.c
@@ -0,0 +1,1092 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains the implementation of the xfs_inode_log_item.
35 * It contains the item operations used to manipulate the inode log
36 * items, as well as utility routines used by the inode-specific
37 * transaction routines.
38 */
39#include "xfs.h"
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_buf_item.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_trans_priv.h"
52#include "xfs_ag.h"
53#include "xfs_alloc_btree.h"
54#include "xfs_bmap_btree.h"
55#include "xfs_ialloc_btree.h"
56#include "xfs_btree.h"
57#include "xfs_ialloc.h"
58#include "xfs_attr_sf.h"
59#include "xfs_dir_sf.h"
60#include "xfs_dir2_sf.h"
61#include "xfs_dinode.h"
62#include "xfs_inode_item.h"
63#include "xfs_inode.h"
64#include "xfs_rw.h"
65
66
67kmem_zone_t *xfs_ili_zone; /* inode log item zone */
68
69/*
70 * This returns the number of iovecs needed to log the given inode item.
71 *
72 * We need one iovec for the inode log format structure, one for the
73 * inode core, and possibly one for the inode data/extents/b-tree root
74 * and one for the inode attribute data/extents/b-tree root.
75 */
76STATIC uint
77xfs_inode_item_size(
78 xfs_inode_log_item_t *iip)
79{
80 uint nvecs;
81 xfs_inode_t *ip;
82
83 ip = iip->ili_inode;
84 nvecs = 2;
85
86 /*
87 * Only log the data/extents/b-tree root if there is something
88 * left to log.
89 */
90 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
91
92 switch (ip->i_d.di_format) {
93 case XFS_DINODE_FMT_EXTENTS:
94 iip->ili_format.ilf_fields &=
95 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
96 XFS_ILOG_DEV | XFS_ILOG_UUID);
97 if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
98 (ip->i_d.di_nextents > 0) &&
99 (ip->i_df.if_bytes > 0)) {
100 ASSERT(ip->i_df.if_u1.if_extents != NULL);
101 nvecs++;
102 } else {
103 iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
104 }
105 break;
106
107 case XFS_DINODE_FMT_BTREE:
108 ASSERT(ip->i_df.if_ext_max ==
109 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
110 iip->ili_format.ilf_fields &=
111 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
112 XFS_ILOG_DEV | XFS_ILOG_UUID);
113 if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
114 (ip->i_df.if_broot_bytes > 0)) {
115 ASSERT(ip->i_df.if_broot != NULL);
116 nvecs++;
117 } else {
118 ASSERT(!(iip->ili_format.ilf_fields &
119 XFS_ILOG_DBROOT));
120#ifdef XFS_TRANS_DEBUG
121 if (iip->ili_root_size > 0) {
122 ASSERT(iip->ili_root_size ==
123 ip->i_df.if_broot_bytes);
124 ASSERT(memcmp(iip->ili_orig_root,
125 ip->i_df.if_broot,
126 iip->ili_root_size) == 0);
127 } else {
128 ASSERT(ip->i_df.if_broot_bytes == 0);
129 }
130#endif
131 iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
132 }
133 break;
134
135 case XFS_DINODE_FMT_LOCAL:
136 iip->ili_format.ilf_fields &=
137 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
138 XFS_ILOG_DEV | XFS_ILOG_UUID);
139 if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
140 (ip->i_df.if_bytes > 0)) {
141 ASSERT(ip->i_df.if_u1.if_data != NULL);
142 ASSERT(ip->i_d.di_size > 0);
143 nvecs++;
144 } else {
145 iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
146 }
147 break;
148
149 case XFS_DINODE_FMT_DEV:
150 iip->ili_format.ilf_fields &=
151 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
152 XFS_ILOG_DEXT | XFS_ILOG_UUID);
153 break;
154
155 case XFS_DINODE_FMT_UUID:
156 iip->ili_format.ilf_fields &=
157 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
158 XFS_ILOG_DEXT | XFS_ILOG_DEV);
159 break;
160
161 default:
162 ASSERT(0);
163 break;
164 }
165
166 /*
167 * If there are no attributes associated with this file,
168 * then there cannot be anything more to log.
169 * Clear all attribute-related log flags.
170 */
171 if (!XFS_IFORK_Q(ip)) {
172 iip->ili_format.ilf_fields &=
173 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
174 return nvecs;
175 }
176
177 /*
178 * Log any necessary attribute data.
179 */
180 switch (ip->i_d.di_aformat) {
181 case XFS_DINODE_FMT_EXTENTS:
182 iip->ili_format.ilf_fields &=
183 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
184 if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
185 (ip->i_d.di_anextents > 0) &&
186 (ip->i_afp->if_bytes > 0)) {
187 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
188 nvecs++;
189 } else {
190 iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
191 }
192 break;
193
194 case XFS_DINODE_FMT_BTREE:
195 iip->ili_format.ilf_fields &=
196 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
197 if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
198 (ip->i_afp->if_broot_bytes > 0)) {
199 ASSERT(ip->i_afp->if_broot != NULL);
200 nvecs++;
201 } else {
202 iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
203 }
204 break;
205
206 case XFS_DINODE_FMT_LOCAL:
207 iip->ili_format.ilf_fields &=
208 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
209 if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
210 (ip->i_afp->if_bytes > 0)) {
211 ASSERT(ip->i_afp->if_u1.if_data != NULL);
212 nvecs++;
213 } else {
214 iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
215 }
216 break;
217
218 default:
219 ASSERT(0);
220 break;
221 }
222
223 return nvecs;
224}
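
A worked instance of the count above, for one assumed inode state:

/*
 * Assumed state: data fork in extents format with XFS_ILOG_DEXT set
 * and nonzero extent bytes; attribute fork present, in local format,
 * with XFS_ILOG_ADATA set and nonzero inline bytes.
 *
 *	nvecs = 2	log format structure + inode core
 *	      + 1	data fork extent records
 *	      + 1	attr fork inline data
 *	      = 4	iovecs for this log item
 */
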
225
226/*
227 * This is called to fill in the vector of log iovecs for the
228 * given inode log item. It fills the first item with an inode
229 * log format structure, the second with the on-disk inode structure,
230 * and a possible third and/or fourth with the inode data/extents/b-tree
231 * root and inode attributes data/extents/b-tree root.
232 */
233STATIC void
234xfs_inode_item_format(
235 xfs_inode_log_item_t *iip,
236 xfs_log_iovec_t *log_vector)
237{
238 uint nvecs;
239 xfs_log_iovec_t *vecp;
240 xfs_inode_t *ip;
241 size_t data_bytes;
242 xfs_bmbt_rec_t *ext_buffer;
243 int nrecs;
244 xfs_mount_t *mp;
245
246 ip = iip->ili_inode;
247 vecp = log_vector;
248
249 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
250 vecp->i_len = sizeof(xfs_inode_log_format_t);
251 vecp++;
252 nvecs = 1;
253
254 /*
255 * Clear i_update_core if the timestamps (or any other
256 * non-transactional modification) need flushing/logging
257 * and we're about to log them with the rest of the core.
258 *
259 * This is the same logic as xfs_iflush() but this code can't
260 * run at the same time as xfs_iflush because we're in commit
261 * processing here and so we have the inode lock held in
262 * exclusive mode. Although it doesn't really matter
263 * for the timestamps if both routines were to grab the
264 * timestamps or not. That would be ok.
265 *
266 * We clear i_update_core before copying out the data.
267 * This is for coordination with our timestamp updates
268 * that don't hold the inode lock. They will always
269 * update the timestamps BEFORE setting i_update_core,
270 * so if we clear i_update_core after they set it we
271 * are guaranteed to see their updates to the timestamps
272 * either here. Likewise, if they set it after we clear it
273 * here, we'll see it either on the next commit of this
274 * inode or the next time the inode gets flushed via
275 * xfs_iflush(). This depends on strongly ordered memory
276 * semantics, but we have that. We use the SYNCHRONIZE
277 * macro to make sure that the compiler does not reorder
278 * the i_update_core access below the data copy below.
279 */
280 if (ip->i_update_core) {
281 ip->i_update_core = 0;
282 SYNCHRONIZE();
283 }
284
285 /*
286 * We don't have to worry about re-ordering here because
287 * the update_size field is protected by the inode lock
288 * and we have that held in exclusive mode.
289 */
290 if (ip->i_update_size)
291 ip->i_update_size = 0;
292
293 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
294 vecp->i_len = sizeof(xfs_dinode_core_t);
295 vecp++;
296 nvecs++;
297 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
298
299 /*
300 * If this is really an old format inode, then we need to
301 * log it as such. This means that we have to copy the link
302 * count from the new field to the old. We don't have to worry
303 * about the new fields, because nothing trusts them as long as
304 * the old inode version number is there. If the superblock already
305 * has a new version number, then we don't bother converting back.
306 */
307 mp = ip->i_mount;
308 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
309 XFS_SB_VERSION_HASNLINK(&mp->m_sb));
310 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
311 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
312 /*
313 * Convert it back.
314 */
315 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
316 ip->i_d.di_onlink = ip->i_d.di_nlink;
317 } else {
318 /*
319 * The superblock version has already been bumped,
320 * so just make the conversion to the new inode
321 * format permanent.
322 */
323 ip->i_d.di_version = XFS_DINODE_VERSION_2;
324 ip->i_d.di_onlink = 0;
325 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
326 }
327 }
328
329 switch (ip->i_d.di_format) {
330 case XFS_DINODE_FMT_EXTENTS:
331 ASSERT(!(iip->ili_format.ilf_fields &
332 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
333 XFS_ILOG_DEV | XFS_ILOG_UUID)));
334 if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
335 ASSERT(ip->i_df.if_bytes > 0);
336 ASSERT(ip->i_df.if_u1.if_extents != NULL);
337 ASSERT(ip->i_d.di_nextents > 0);
338 ASSERT(iip->ili_extents_buf == NULL);
339 nrecs = ip->i_df.if_bytes /
340 (uint)sizeof(xfs_bmbt_rec_t);
341 ASSERT(nrecs > 0);
342#if __BYTE_ORDER == __BIG_ENDIAN
343 if (nrecs == ip->i_d.di_nextents) {
344 /*
345 * There are no delayed allocation
346 * extents, so just point to the
347 * real extents array.
348 */
349 vecp->i_addr =
350 (char *)(ip->i_df.if_u1.if_extents);
351 vecp->i_len = ip->i_df.if_bytes;
352 } else
353#endif
354 {
355 /*
356 * There are delayed allocation extents
357 * in the inode, or we need to convert
358 * the extents to on disk format.
359 * Use xfs_iextents_copy()
360 * to copy only the real extents into
361 * a separate buffer. We'll free the
362 * buffer in the unlock routine.
363 */
364 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
365 KM_SLEEP);
366 iip->ili_extents_buf = ext_buffer;
367 vecp->i_addr = (xfs_caddr_t)ext_buffer;
368 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
369 XFS_DATA_FORK);
370 }
371 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
372 iip->ili_format.ilf_dsize = vecp->i_len;
373 vecp++;
374 nvecs++;
375 }
376 break;
377
378 case XFS_DINODE_FMT_BTREE:
379 ASSERT(!(iip->ili_format.ilf_fields &
380 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
381 XFS_ILOG_DEV | XFS_ILOG_UUID)));
382 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
383 ASSERT(ip->i_df.if_broot_bytes > 0);
384 ASSERT(ip->i_df.if_broot != NULL);
385 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
386 vecp->i_len = ip->i_df.if_broot_bytes;
387 vecp++;
388 nvecs++;
389 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
390 }
391 break;
392
393 case XFS_DINODE_FMT_LOCAL:
394 ASSERT(!(iip->ili_format.ilf_fields &
395 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
396 XFS_ILOG_DEV | XFS_ILOG_UUID)));
397 if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
398 ASSERT(ip->i_df.if_bytes > 0);
399 ASSERT(ip->i_df.if_u1.if_data != NULL);
400 ASSERT(ip->i_d.di_size > 0);
401
402 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
403 /*
404			 * Round if_bytes up to a word boundary.
405			 * The underlying memory is guaranteed
406			 * to be there by xfs_idata_realloc().
407 */
408 data_bytes = roundup(ip->i_df.if_bytes, 4);
409 ASSERT((ip->i_df.if_real_bytes == 0) ||
410 (ip->i_df.if_real_bytes == data_bytes));
411 vecp->i_len = (int)data_bytes;
412 vecp++;
413 nvecs++;
414 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
415 }
416 break;
417
418 case XFS_DINODE_FMT_DEV:
419 ASSERT(!(iip->ili_format.ilf_fields &
420 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
421 XFS_ILOG_DDATA | XFS_ILOG_UUID)));
422 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
423 iip->ili_format.ilf_u.ilfu_rdev =
424 ip->i_df.if_u2.if_rdev;
425 }
426 break;
427
428 case XFS_DINODE_FMT_UUID:
429 ASSERT(!(iip->ili_format.ilf_fields &
430 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
431 XFS_ILOG_DDATA | XFS_ILOG_DEV)));
432 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
433 iip->ili_format.ilf_u.ilfu_uuid =
434 ip->i_df.if_u2.if_uuid;
435 }
436 break;
437
438 default:
439 ASSERT(0);
440 break;
441 }
442
443 /*
444 * If there are no attributes associated with the file,
445 * then we're done.
446 * Assert that no attribute-related log flags are set.
447 */
448 if (!XFS_IFORK_Q(ip)) {
449 ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
450 iip->ili_format.ilf_size = nvecs;
451 ASSERT(!(iip->ili_format.ilf_fields &
452 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
453 return;
454 }
455
456 switch (ip->i_d.di_aformat) {
457 case XFS_DINODE_FMT_EXTENTS:
458 ASSERT(!(iip->ili_format.ilf_fields &
459 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
460 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
461 ASSERT(ip->i_afp->if_bytes > 0);
462 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
463 ASSERT(ip->i_d.di_anextents > 0);
464#ifdef DEBUG
465 nrecs = ip->i_afp->if_bytes /
466 (uint)sizeof(xfs_bmbt_rec_t);
467#endif
468 ASSERT(nrecs > 0);
469 ASSERT(nrecs == ip->i_d.di_anextents);
470#if __BYTE_ORDER == __BIG_ENDIAN
471 /*
 472			 * There are no delayed allocation extents
473 * for attributes, so just point at the array.
474 */
475 vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
476 vecp->i_len = ip->i_afp->if_bytes;
477#else
478 ASSERT(iip->ili_aextents_buf == NULL);
479 /*
480 * Need to endian flip before logging
481 */
482 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
483 KM_SLEEP);
484 iip->ili_aextents_buf = ext_buffer;
485 vecp->i_addr = (xfs_caddr_t)ext_buffer;
486 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
487 XFS_ATTR_FORK);
488#endif
489 iip->ili_format.ilf_asize = vecp->i_len;
490 vecp++;
491 nvecs++;
492 }
493 break;
494
495 case XFS_DINODE_FMT_BTREE:
496 ASSERT(!(iip->ili_format.ilf_fields &
497 (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
498 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
499 ASSERT(ip->i_afp->if_broot_bytes > 0);
500 ASSERT(ip->i_afp->if_broot != NULL);
501 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
502 vecp->i_len = ip->i_afp->if_broot_bytes;
503 vecp++;
504 nvecs++;
505 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
506 }
507 break;
508
509 case XFS_DINODE_FMT_LOCAL:
510 ASSERT(!(iip->ili_format.ilf_fields &
511 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
512 if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
513 ASSERT(ip->i_afp->if_bytes > 0);
514 ASSERT(ip->i_afp->if_u1.if_data != NULL);
515
516 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
517 /*
 518			 * Round if_bytes up to a word boundary.
 519			 * The underlying memory is guaranteed
 520			 * to be there by xfs_idata_realloc().
521 */
522 data_bytes = roundup(ip->i_afp->if_bytes, 4);
523 ASSERT((ip->i_afp->if_real_bytes == 0) ||
524 (ip->i_afp->if_real_bytes == data_bytes));
525 vecp->i_len = (int)data_bytes;
526 vecp++;
527 nvecs++;
528 iip->ili_format.ilf_asize = (unsigned)data_bytes;
529 }
530 break;
531
532 default:
533 ASSERT(0);
534 break;
535 }
536
537 ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
538 iip->ili_format.ilf_size = nvecs;
539}
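/*
 * A sketch of how the caller is expected to consume the result -- an
 * assumption drawn from the structure above, not a quote of the log
 * code: the commit path walks the ilf_size regions recorded in the
 * iovec array and copies each one into the log, roughly
 *
 *	for (i = 0; i < iip->ili_format.ilf_size; i++)
 *		copy log_vector[i].i_addr for log_vector[i].i_len bytes
 *
 * which is why every branch above must advance vecp and bump nvecs
 * exactly once per region it fills in.
 */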
540
541
542/*
543 * This is called to pin the inode associated with the inode log
544 * item in memory so it cannot be written out. Do this by calling
545 * xfs_ipin() to bump the pin count in the inode while holding the
546 * inode pin lock.
547 */
548STATIC void
549xfs_inode_item_pin(
550 xfs_inode_log_item_t *iip)
551{
552 ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
553 xfs_ipin(iip->ili_inode);
554}
555
556
557/*
558 * This is called to unpin the inode associated with the inode log
559 * item which was previously pinned with a call to xfs_inode_item_pin().
560 * Just call xfs_iunpin() on the inode to do this.
561 */
562/* ARGSUSED */
563STATIC void
564xfs_inode_item_unpin(
565 xfs_inode_log_item_t *iip,
566 int stale)
567{
568 xfs_iunpin(iip->ili_inode);
569}
570
571/* ARGSUSED */
572STATIC void
573xfs_inode_item_unpin_remove(
574 xfs_inode_log_item_t *iip,
575 xfs_trans_t *tp)
576{
577 xfs_iunpin(iip->ili_inode);
578}
579
580/*
581 * This is called to attempt to lock the inode associated with this
582 * inode log item, in preparation for the push routine which does the actual
583 * iflush. Don't sleep on the inode lock or the flush lock.
584 *
585 * If the flush lock is already held, indicating that the inode has
586 * been or is in the process of being flushed, then (ideally) we'd like to
587 * see if the inode's buffer is still incore, and if so give it a nudge.
588 * We delay doing so until the pushbuf routine, though, to avoid holding
 589 * the AIL lock across a call into the black hole that is the buffer
 590 * cache. Also, we don't want to sleep in any device strategy routine,
 591 * which can happen if we do the subsequent bawrite in here.
592 */
593STATIC uint
594xfs_inode_item_trylock(
595 xfs_inode_log_item_t *iip)
596{
597 register xfs_inode_t *ip;
598
599 ip = iip->ili_inode;
600
601 if (xfs_ipincount(ip) > 0) {
602 return XFS_ITEM_PINNED;
603 }
604
605 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
606 return XFS_ITEM_LOCKED;
607 }
608
609 if (!xfs_iflock_nowait(ip)) {
610 /*
611 * If someone else isn't already trying to push the inode
612 * buffer, we get to do it.
613 */
614 if (iip->ili_pushbuf_flag == 0) {
615 iip->ili_pushbuf_flag = 1;
616#ifdef DEBUG
617 iip->ili_push_owner = get_thread_id();
618#endif
619 /*
620 * Inode is left locked in shared mode.
621 * Pushbuf routine gets to unlock it.
622 */
623 return XFS_ITEM_PUSHBUF;
624 } else {
625 /*
626 * We hold the AIL_LOCK, so we must specify the
627 * NONOTIFY flag so that we won't double trip.
628 */
629 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
630 return XFS_ITEM_FLUSHING;
631 }
632 /* NOTREACHED */
633 }
634
635 /* Stale items should force out the iclog */
636 if (ip->i_flags & XFS_ISTALE) {
637 xfs_ifunlock(ip);
638 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
639 return XFS_ITEM_PINNED;
640 }
641
642#ifdef DEBUG
643 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
644 ASSERT(iip->ili_format.ilf_fields != 0);
645 ASSERT(iip->ili_logged == 0);
646 ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
647 }
648#endif
649 return XFS_ITEM_SUCCESS;
650}
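/*
 * A rough sketch of how the AIL push loop in xfs_trans_push_ail()
 * presumably reacts to these return codes (inferred from the names,
 * not a quote of that routine):
 *
 *	switch (IOP_TRYLOCK(lip)) {
 *	case XFS_ITEM_SUCCESS:	flush the item via IOP_PUSH()
 *	case XFS_ITEM_PUSHBUF:	nudge the backing buffer via IOP_PUSHBUF()
 *	case XFS_ITEM_PINNED:	force the log, revisit the item later
 *	case XFS_ITEM_LOCKED:
 *	case XFS_ITEM_FLUSHING:	skip the item on this pass
 *	}
 */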
651
652/*
653 * Unlock the inode associated with the inode log item.
654 * Clear the fields of the inode and inode log item that
655 * are specific to the current transaction. If the
 656 * hold flag is set, do not unlock the inode.
657 */
658STATIC void
659xfs_inode_item_unlock(
660 xfs_inode_log_item_t *iip)
661{
662 uint hold;
663 uint iolocked;
664 uint lock_flags;
665 xfs_inode_t *ip;
666
667 ASSERT(iip != NULL);
668 ASSERT(iip->ili_inode->i_itemp != NULL);
669 ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
670 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
671 XFS_ILI_IOLOCKED_EXCL)) ||
672 ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE));
673 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
674 XFS_ILI_IOLOCKED_SHARED)) ||
675 ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS));
676 /*
677 * Clear the transaction pointer in the inode.
678 */
679 ip = iip->ili_inode;
680 ip->i_transp = NULL;
681
682 /*
683 * If the inode needed a separate buffer with which to log
684 * its extents, then free it now.
685 */
686 if (iip->ili_extents_buf != NULL) {
687 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
688 ASSERT(ip->i_d.di_nextents > 0);
689 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
690 ASSERT(ip->i_df.if_bytes > 0);
691 kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes);
692 iip->ili_extents_buf = NULL;
693 }
694 if (iip->ili_aextents_buf != NULL) {
695 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
696 ASSERT(ip->i_d.di_anextents > 0);
697 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
698 ASSERT(ip->i_afp->if_bytes > 0);
699 kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes);
700 iip->ili_aextents_buf = NULL;
701 }
702
703 /*
704 * Figure out if we should unlock the inode or not.
705 */
706 hold = iip->ili_flags & XFS_ILI_HOLD;
707
708 /*
709 * Before clearing out the flags, remember whether we
710 * are holding the inode's IO lock.
711 */
712 iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
713
714 /*
715 * Clear out the fields of the inode log item particular
716 * to the current transaction.
717 */
718 iip->ili_ilock_recur = 0;
719 iip->ili_iolock_recur = 0;
720 iip->ili_flags = 0;
721
722 /*
723 * Unlock the inode if XFS_ILI_HOLD was not set.
724 */
725 if (!hold) {
726 lock_flags = XFS_ILOCK_EXCL;
727 if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
728 lock_flags |= XFS_IOLOCK_EXCL;
729 } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
730 lock_flags |= XFS_IOLOCK_SHARED;
731 }
732 xfs_iput(iip->ili_inode, lock_flags);
733 }
734}
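/*
 * XFS_ILI_HOLD is what xfs_trans_ihold() arranges; a sketch of the
 * usual pattern (compare xfs_iomap_write_direct() in xfs_iomap.c):
 *
 *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ihold(tp, ip);
 *	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *
 * i.e. the inode stays locked across the commit and the caller drops
 * the lock itself afterwards.
 */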
735
736/*
737 * This is called to find out where the oldest active copy of the
738 * inode log item in the on disk log resides now that the last log
739 * write of it completed at the given lsn. Since we always re-log
740 * all dirty data in an inode, the latest copy in the on disk log
741 * is the only one that matters. Therefore, simply return the
742 * given lsn.
743 */
744/*ARGSUSED*/
745STATIC xfs_lsn_t
746xfs_inode_item_committed(
747 xfs_inode_log_item_t *iip,
748 xfs_lsn_t lsn)
749{
750 return (lsn);
751}
752
753/*
754 * The transaction with the inode locked has aborted. The inode
755 * must not be dirty within the transaction (unless we're forcibly
756 * shutting down). We simply unlock just as if the transaction
757 * had been cancelled.
758 */
759STATIC void
760xfs_inode_item_abort(
761 xfs_inode_log_item_t *iip)
762{
763 xfs_inode_item_unlock(iip);
764 return;
765}
766
767
768/*
769 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
770 * failed to get the inode flush lock but did get the inode locked SHARED.
 771 * Here we're trying to see if the inode buffer is incore, and if so
 772 * whether it's marked delayed write. If that's the case, we'll initiate
 773 * a bawrite on that buffer to expedite the process.
774 *
775 * We aren't holding the AIL_LOCK (or the flush lock) when this gets called,
 776 * so it is inherently racy.
777 */
778STATIC void
779xfs_inode_item_pushbuf(
780 xfs_inode_log_item_t *iip)
781{
782 xfs_inode_t *ip;
783 xfs_mount_t *mp;
784 xfs_buf_t *bp;
785 uint dopush;
786
787 ip = iip->ili_inode;
788
789 ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
790
791 /*
792 * The ili_pushbuf_flag keeps others from
793 * trying to duplicate our effort.
794 */
795 ASSERT(iip->ili_pushbuf_flag != 0);
796 ASSERT(iip->ili_push_owner == get_thread_id());
797
798 /*
799 * If flushlock isn't locked anymore, chances are that the
800 * inode flush completed and the inode was taken off the AIL.
801 * So, just get out.
802 */
803 if ((valusema(&(ip->i_flock)) > 0) ||
804 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
805 iip->ili_pushbuf_flag = 0;
806 xfs_iunlock(ip, XFS_ILOCK_SHARED);
807 return;
808 }
809
810 mp = ip->i_mount;
811 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
812 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);
813
814 if (bp != NULL) {
815 if (XFS_BUF_ISDELAYWRITE(bp)) {
816 /*
817 * We were racing with iflush because we don't hold
818 * the AIL_LOCK or the flush lock. However, at this point,
819 * we have the buffer, and we know that it's dirty.
820 * So, it's possible that iflush raced with us, and
821 * this item is already taken off the AIL.
822 * If not, we can flush it async.
823 */
824 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
825 (valusema(&(ip->i_flock)) <= 0));
826 iip->ili_pushbuf_flag = 0;
827 xfs_iunlock(ip, XFS_ILOCK_SHARED);
828 xfs_buftrace("INODE ITEM PUSH", bp);
829 if (XFS_BUF_ISPINNED(bp)) {
830 xfs_log_force(mp, (xfs_lsn_t)0,
831 XFS_LOG_FORCE);
832 }
833 if (dopush) {
834 xfs_bawrite(mp, bp);
835 } else {
836 xfs_buf_relse(bp);
837 }
838 } else {
839 iip->ili_pushbuf_flag = 0;
840 xfs_iunlock(ip, XFS_ILOCK_SHARED);
841 xfs_buf_relse(bp);
842 }
843 return;
844 }
845 /*
 846	 * We have to be careful about resetting the pushbuf flag too early
 847	 * (above). Even though in theory we could do it as soon as we have
 848	 * the buflock, we don't want others to be doing work needlessly.
 849	 * They'd come to this function thinking that pushing the buffer is
 850	 * their responsibility, only to find that the buffer is still locked
 851	 * by another thread doing the same thing.
852 */
853 iip->ili_pushbuf_flag = 0;
854 xfs_iunlock(ip, XFS_ILOCK_SHARED);
855 return;
856}
857
858
859/*
860 * This is called to asynchronously write the inode associated with this
861 * inode log item out to disk. The inode will already have been locked by
862 * a successful call to xfs_inode_item_trylock().
863 */
864STATIC void
865xfs_inode_item_push(
866 xfs_inode_log_item_t *iip)
867{
868 xfs_inode_t *ip;
869
870 ip = iip->ili_inode;
871
872 ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
873 ASSERT(valusema(&(ip->i_flock)) <= 0);
874 /*
875 * Since we were able to lock the inode's flush lock and
876 * we found it on the AIL, the inode must be dirty. This
877 * is because the inode is removed from the AIL while still
878 * holding the flush lock in xfs_iflush_done(). Thus, if
879 * we found it in the AIL and were able to obtain the flush
880 * lock without sleeping, then there must not have been
881 * anyone in the process of flushing the inode.
882 */
883 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
884 iip->ili_format.ilf_fields != 0);
885
886 /*
887 * Write out the inode. The completion routine ('iflush_done') will
 888	 * pull it from the AIL, mark it clean, and unlock the flush lock.
889 */
890 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
891 xfs_iunlock(ip, XFS_ILOCK_SHARED);
892
893 return;
894}
895
896/*
897 * XXX rcc - this one really has to do something. Probably needs
898 * to stamp in a new field in the incore inode.
899 */
900/* ARGSUSED */
901STATIC void
902xfs_inode_item_committing(
903 xfs_inode_log_item_t *iip,
904 xfs_lsn_t lsn)
905{
906 iip->ili_last_lsn = lsn;
907 return;
908}
909
910/*
 911 * This is the ops vector shared by all inode log items.
912 */
913struct xfs_item_ops xfs_inode_item_ops = {
914 .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
915 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
916 xfs_inode_item_format,
917 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
918 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin,
919 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
920 xfs_inode_item_unpin_remove,
921 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
922 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
923 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
924 xfs_inode_item_committed,
925 .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push,
926 .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort,
927 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
928 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
929 xfs_inode_item_committing
930};
931
932
933/*
934 * Initialize the inode log item for a newly allocated (in-core) inode.
935 */
936void
937xfs_inode_item_init(
938 xfs_inode_t *ip,
939 xfs_mount_t *mp)
940{
941 xfs_inode_log_item_t *iip;
942
943 ASSERT(ip->i_itemp == NULL);
944 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
945
946 iip->ili_item.li_type = XFS_LI_INODE;
947 iip->ili_item.li_ops = &xfs_inode_item_ops;
948 iip->ili_item.li_mountp = mp;
949 iip->ili_inode = ip;
950
 951	/*
 952	 * We have zeroed memory, so there is no need to explicitly
 953	 * clear fields such as ili_extents_buf or ili_pushbuf_flag
 954	 * here.
 955	 */
956
957 iip->ili_format.ilf_type = XFS_LI_INODE;
958 iip->ili_format.ilf_ino = ip->i_ino;
959 iip->ili_format.ilf_blkno = ip->i_blkno;
960 iip->ili_format.ilf_len = ip->i_len;
961 iip->ili_format.ilf_boffset = ip->i_boffset;
962}
963
964/*
965 * Free the inode log item and any memory hanging off of it.
966 */
967void
968xfs_inode_item_destroy(
969 xfs_inode_t *ip)
970{
971#ifdef XFS_TRANS_DEBUG
972 if (ip->i_itemp->ili_root_size != 0) {
973 kmem_free(ip->i_itemp->ili_orig_root,
974 ip->i_itemp->ili_root_size);
975 }
976#endif
977 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
978}
979
980
981/*
982 * This is the inode flushing I/O completion routine. It is called
983 * from interrupt level when the buffer containing the inode is
984 * flushed to disk. It is responsible for removing the inode item
985 * from the AIL if it has not been re-logged, and unlocking the inode's
986 * flush lock.
987 */
988/*ARGSUSED*/
989void
990xfs_iflush_done(
991 xfs_buf_t *bp,
992 xfs_inode_log_item_t *iip)
993{
994 xfs_inode_t *ip;
995 SPLDECL(s);
996
997 ip = iip->ili_inode;
998
999 /*
1000 * We only want to pull the item from the AIL if it is
1001 * actually there and its location in the log has not
1002 * changed since we started the flush. Thus, we only bother
1003 * if the ili_logged flag is set and the inode's lsn has not
1004 * changed. First we check the lsn outside
1005 * the lock since it's cheaper, and then we recheck while
1006 * holding the lock before removing the inode from the AIL.
1007 */
1008 if (iip->ili_logged &&
1009 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
1010 AIL_LOCK(ip->i_mount, s);
1011 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
1012 /*
1013 * xfs_trans_delete_ail() drops the AIL lock.
1014 */
1015 xfs_trans_delete_ail(ip->i_mount,
1016 (xfs_log_item_t*)iip, s);
1017 } else {
1018 AIL_UNLOCK(ip->i_mount, s);
1019 }
1020 }
1021
1022 iip->ili_logged = 0;
1023
1024 /*
1025 * Clear the ili_last_fields bits now that we know that the
1026 * data corresponding to them is safely on disk.
1027 */
1028 iip->ili_last_fields = 0;
1029
1030 /*
1031 * Release the inode's flush lock since we're done with it.
1032 */
1033 xfs_ifunlock(ip);
1034
1035 return;
1036}
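/*
 * A sketch of how this completion routine is presumably attached by
 * xfs_iflush() -- an assumption; the attach routine lives in the
 * transaction buffer code:
 *
 *	xfs_buf_attach_iodone(bp,
 *		(void(*)(xfs_buf_t *, xfs_log_item_t *))xfs_iflush_done,
 *		(xfs_log_item_t *)iip);
 *
 * so it runs once for each logged inode attached to the buffer when
 * the buffer I/O completes.
 */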
1037
1038/*
1039 * This is the inode flushing abort routine. It is called
1040 * from xfs_iflush when the filesystem is shutting down to clean
1041 * up the inode state.
1042 * It is responsible for removing the inode item
1043 * from the AIL if it has not been re-logged, and unlocking the inode's
1044 * flush lock.
1045 */
1046void
1047xfs_iflush_abort(
1048 xfs_inode_t *ip)
1049{
1050 xfs_inode_log_item_t *iip;
1051 xfs_mount_t *mp;
1052 SPLDECL(s);
1053
1054 iip = ip->i_itemp;
1055 mp = ip->i_mount;
1056 if (iip) {
1057 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1058 AIL_LOCK(mp, s);
1059 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1060 /*
1061 * xfs_trans_delete_ail() drops the AIL lock.
1062 */
1063 xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip,
1064 s);
1065 } else
1066 AIL_UNLOCK(mp, s);
1067 }
1068 iip->ili_logged = 0;
1069 /*
1070 * Clear the ili_last_fields bits now that we know that the
1071 * data corresponding to them is safely on disk.
1072 */
1073 iip->ili_last_fields = 0;
1074 /*
1075 * Clear the inode logging fields so no more flushes are
1076 * attempted.
1077 */
1078 iip->ili_format.ilf_fields = 0;
1079 }
1080 /*
1081 * Release the inode's flush lock since we're done with it.
1082 */
1083 xfs_ifunlock(ip);
1084}
1085
1086void
1087xfs_istale_done(
1088 xfs_buf_t *bp,
1089 xfs_inode_log_item_t *iip)
1090{
1091 xfs_iflush_abort(iip->ili_inode);
1092}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
new file mode 100644
index 000000000000..d8775e0d6291
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.h
@@ -0,0 +1,197 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_INODE_ITEM_H__
33#define __XFS_INODE_ITEM_H__
34
35/*
36 * This is the structure used to lay out an inode log item in the
37 * log. The size of the inline data/extents/b-tree root to be logged
38 * (if any) is indicated in the ilf_dsize field. Changes to this structure
39 * must be added on to the end.
40 *
41 * Convention for naming inode log item versions: the current version
42 * is always named XFS_LI_INODE. When an inode log item gets superseded,
43 * add the latest version of IRIX that will generate logs with that item
44 * to the version name.
45 *
46 * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first
47 * union (ilf_u) field. This was released with IRIX 5.3-XFS.
48 * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire
49 * structure. This was released with IRIX 6.0.1-XFS and IRIX 6.1.
50 * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2
51 * so a new structure definition wasn't necessary. However, we had
52 * to add a new type because the inode cluster size changed from 4K
53 * to 8K and the version number had to be rev'ved to keep older kernels
54 * from trying to recover logs with the 8K buffers in them. The logging
55 * code can handle recovery on different-sized clusters now so hopefully
56 * this'll be the last time we need to change the inode log item just
57 * for a change in the inode cluster size. This new version was
58 * released with IRIX 6.2.
59 */
60typedef struct xfs_inode_log_format {
61 unsigned short ilf_type; /* inode log item type */
62 unsigned short ilf_size; /* size of this item */
63 uint ilf_fields; /* flags for fields logged */
64 ushort ilf_asize; /* size of attr d/ext/root */
65 ushort ilf_dsize; /* size of data/ext/root */
66 xfs_ino_t ilf_ino; /* inode number */
67 union {
68 xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/
69 uuid_t ilfu_uuid; /* mount point value */
70 } ilf_u;
71 __int64_t ilf_blkno; /* blkno of inode buffer */
72 int ilf_len; /* len of inode buffer */
73 int ilf_boffset; /* off of inode in buffer */
74} xfs_inode_log_format_t;
75
76/* Initial version shipped with IRIX 5.3-XFS */
77typedef struct xfs_inode_log_format_v1 {
78 unsigned short ilf_type; /* inode log item type */
79 unsigned short ilf_size; /* size of this item */
80 uint ilf_fields; /* flags for fields logged */
81 uint ilf_dsize; /* size of data/ext/root */
82 xfs_ino_t ilf_ino; /* inode number */
83 union {
84 xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/
85 uuid_t ilfu_uuid; /* mount point value */
86 } ilf_u;
87} xfs_inode_log_format_t_v1;
88
89/*
90 * Flags for xfs_trans_log_inode flags field.
91 */
92#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
93#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
94#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
95#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
96#define XFS_ILOG_DEV 0x010 /* log the dev field */
97#define XFS_ILOG_UUID 0x020 /* log the uuid field */
98#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
99#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
100#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
101
102#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
103 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
104 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
105 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
106
107#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
108 XFS_ILOG_DBROOT)
109
110#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
111 XFS_ILOG_ABROOT)
112
113#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
114 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
115 XFS_ILOG_DEV | XFS_ILOG_UUID | \
116 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
117 XFS_ILOG_ABROOT)
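/*
 * Typical use of these flags, e.g. when both the inode core and the
 * incore data fork extent list changed within a transaction (a
 * sketch; see xfs_trans_log_inode()):
 *
 *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DEXT);
 */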
118
119#define XFS_ILI_HOLD 0x1
120#define XFS_ILI_IOLOCKED_EXCL 0x2
121#define XFS_ILI_IOLOCKED_SHARED 0x4
122
123#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
124
125
126#ifdef __KERNEL__
127
128struct xfs_buf;
129struct xfs_bmbt_rec_64;
130struct xfs_inode;
131struct xfs_mount;
132
133
134typedef struct xfs_inode_log_item {
135 xfs_log_item_t ili_item; /* common portion */
136 struct xfs_inode *ili_inode; /* inode ptr */
137 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
138 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
139 unsigned short ili_ilock_recur; /* lock recursion count */
140 unsigned short ili_iolock_recur; /* lock recursion count */
141 unsigned short ili_flags; /* misc flags */
142 unsigned short ili_logged; /* flushed logged data */
143 unsigned int ili_last_fields; /* fields when flushed */
144 struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged
145 data exts */
146 struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged
147 attr exts */
148 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
149
150#ifdef DEBUG
151 uint64_t ili_push_owner; /* one who sets pushbuf_flag
152 above gets to push the buf */
153#endif
154#ifdef XFS_TRANS_DEBUG
155 int ili_root_size;
156 char *ili_orig_root;
157#endif
158 xfs_inode_log_format_t ili_format; /* logged structure */
159} xfs_inode_log_item_t;
160
161
162#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FDATA)
163int xfs_ilog_fdata(int w);
164#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
165#else
166#define XFS_ILOG_FDATA(w) \
167 ((w) == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA)
168#endif
169
170#endif /* __KERNEL__ */
171
172#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FBROOT)
173int xfs_ilog_fbroot(int w);
174#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
175#else
176#define XFS_ILOG_FBROOT(w) \
177 ((w) == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT)
178#endif
179#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FEXT)
180int xfs_ilog_fext(int w);
181#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
182#else
183#define XFS_ILOG_FEXT(w) \
184 ((w) == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT)
185#endif
186
187#ifdef __KERNEL__
188
189void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
190void xfs_inode_item_destroy(struct xfs_inode *);
191void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
192void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
193void xfs_iflush_abort(struct xfs_inode *);
194
195#endif /* __KERNEL__ */
196
197#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
new file mode 100644
index 000000000000..a3af2d5a6eb7
--- /dev/null
+++ b/fs/xfs/xfs_inum.h
@@ -0,0 +1,173 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_INUM_H__
33#define __XFS_INUM_H__
34
35/*
36 * Inode number format:
37 * low inopblog bits - offset in block
38 * next agblklog bits - block number in ag
39 * next agno_log bits - ag number
40 * remaining high bits - zero
41 */
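/*
 * A worked example with hypothetical geometry: sb_inopblog = 5
 * (32 inodes per block) and sb_agblklog = 16 give
 *
 *	ino 0x4001e3 = (agno 2 << 21) | (agbno 0xf << 5) | offset 3
 *
 * which is exactly what the XFS_INO_TO_AGNO/AGBNO/OFFSET macros
 * below pick back apart.
 */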
42
43typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */
44
45/*
46 * Useful inode bits for this kernel.
47 * Used in some places where having 64-bits in the 32-bit kernels
48 * costs too much.
49 */
50#if XFS_BIG_INUMS
51typedef xfs_ino_t xfs_intino_t;
52#else
53typedef __uint32_t xfs_intino_t;
54#endif
55
56#define NULLFSINO ((xfs_ino_t)-1)
57#define NULLAGINO ((xfs_agino_t)-1)
58
59struct xfs_mount;
60
61#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_MASK)
62__uint32_t xfs_ino_mask(int k);
63#define XFS_INO_MASK(k) xfs_ino_mask(k)
64#else
65#define XFS_INO_MASK(k) ((__uint32_t)((1ULL << (k)) - 1))
66#endif
67#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_OFFSET_BITS)
68int xfs_ino_offset_bits(struct xfs_mount *mp);
69#define XFS_INO_OFFSET_BITS(mp) xfs_ino_offset_bits(mp)
70#else
71#define XFS_INO_OFFSET_BITS(mp) ((mp)->m_sb.sb_inopblog)
72#endif
73#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGBNO_BITS)
74int xfs_ino_agbno_bits(struct xfs_mount *mp);
75#define XFS_INO_AGBNO_BITS(mp) xfs_ino_agbno_bits(mp)
76#else
77#define XFS_INO_AGBNO_BITS(mp) ((mp)->m_sb.sb_agblklog)
78#endif
79#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGINO_BITS)
80int xfs_ino_agino_bits(struct xfs_mount *mp);
81#define XFS_INO_AGINO_BITS(mp) xfs_ino_agino_bits(mp)
82#else
83#define XFS_INO_AGINO_BITS(mp) ((mp)->m_agino_log)
84#endif
85#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGNO_BITS)
86int xfs_ino_agno_bits(struct xfs_mount *mp);
87#define XFS_INO_AGNO_BITS(mp) xfs_ino_agno_bits(mp)
88#else
89#define XFS_INO_AGNO_BITS(mp) ((mp)->m_agno_log)
90#endif
91#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_BITS)
92int xfs_ino_bits(struct xfs_mount *mp);
93#define XFS_INO_BITS(mp) xfs_ino_bits(mp)
94#else
95#define XFS_INO_BITS(mp) (XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp))
96#endif
97
98#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGNO)
99xfs_agnumber_t xfs_ino_to_agno(struct xfs_mount *mp, xfs_ino_t i);
100#define XFS_INO_TO_AGNO(mp,i) xfs_ino_to_agno(mp,i)
101#else
102#define XFS_INO_TO_AGNO(mp,i) \
103 ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
104#endif
105#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGINO)
106xfs_agino_t xfs_ino_to_agino(struct xfs_mount *mp, xfs_ino_t i);
107#define XFS_INO_TO_AGINO(mp,i) xfs_ino_to_agino(mp,i)
108#else
109#define XFS_INO_TO_AGINO(mp,i) \
110 ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
111#endif
112#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGBNO)
113xfs_agblock_t xfs_ino_to_agbno(struct xfs_mount *mp, xfs_ino_t i);
114#define XFS_INO_TO_AGBNO(mp,i) xfs_ino_to_agbno(mp,i)
115#else
116#define XFS_INO_TO_AGBNO(mp,i) \
117 (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
118 XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
119#endif
120#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_OFFSET)
121int xfs_ino_to_offset(struct xfs_mount *mp, xfs_ino_t i);
122#define XFS_INO_TO_OFFSET(mp,i) xfs_ino_to_offset(mp,i)
123#else
124#define XFS_INO_TO_OFFSET(mp,i) \
125 ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
126#endif
127#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_FSB)
128xfs_fsblock_t xfs_ino_to_fsb(struct xfs_mount *mp, xfs_ino_t i);
129#define XFS_INO_TO_FSB(mp,i) xfs_ino_to_fsb(mp,i)
130#else
131#define XFS_INO_TO_FSB(mp,i) \
132 XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
133#endif
134
135#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_INO)
136xfs_ino_t
137xfs_agino_to_ino(struct xfs_mount *mp, xfs_agnumber_t a, xfs_agino_t i);
138#define XFS_AGINO_TO_INO(mp,a,i) xfs_agino_to_ino(mp,a,i)
139#else
140#define XFS_AGINO_TO_INO(mp,a,i) \
141 (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
142#endif
143#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_AGBNO)
144xfs_agblock_t xfs_agino_to_agbno(struct xfs_mount *mp, xfs_agino_t i);
145#define XFS_AGINO_TO_AGBNO(mp,i) xfs_agino_to_agbno(mp,i)
146#else
147#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
148#endif
149#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_OFFSET)
150int xfs_agino_to_offset(struct xfs_mount *mp, xfs_agino_t i);
151#define XFS_AGINO_TO_OFFSET(mp,i) xfs_agino_to_offset(mp,i)
152#else
153#define XFS_AGINO_TO_OFFSET(mp,i) \
154 ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
155#endif
156
157#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_OFFBNO_TO_AGINO)
158xfs_agino_t xfs_offbno_to_agino(struct xfs_mount *mp, xfs_agblock_t b, int o);
159#define XFS_OFFBNO_TO_AGINO(mp,b,o) xfs_offbno_to_agino(mp,b,o)
160#else
161#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
162 ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
163#endif
164
165#if XFS_BIG_INUMS
166#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
167#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
168#else
169#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
170#endif
171#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
172
173#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
new file mode 100644
index 000000000000..414ec496845f
--- /dev/null
+++ b/fs/xfs/xfs_iocore.c
@@ -0,0 +1,133 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_itable.h"
49#include "xfs_btree.h"
50#include "xfs_alloc.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_error.h"
60#include "xfs_bit.h"
61#include "xfs_rw.h"
62#include "xfs_quota.h"
63#include "xfs_trans_space.h"
64#include "xfs_iomap.h"
65
66
67STATIC xfs_fsize_t
68xfs_size_fn(
69 xfs_inode_t *ip)
70{
71 return (ip->i_d.di_size);
72}
73
74STATIC int
75xfs_ioinit(
76 struct vfs *vfsp,
77 struct xfs_mount_args *mntargs,
78 int flags)
79{
80 return xfs_mountfs(vfsp, XFS_VFSTOM(vfsp), flags);
81}
82
83xfs_ioops_t xfs_iocore_xfs = {
84 .xfs_ioinit = (xfs_ioinit_t) xfs_ioinit,
85 .xfs_bmapi_func = (xfs_bmapi_t) xfs_bmapi,
86 .xfs_bmap_eof_func = (xfs_bmap_eof_t) xfs_bmap_eof,
87 .xfs_iomap_write_direct =
88 (xfs_iomap_write_direct_t) xfs_iomap_write_direct,
89 .xfs_iomap_write_delay =
90 (xfs_iomap_write_delay_t) xfs_iomap_write_delay,
91 .xfs_iomap_write_allocate =
92 (xfs_iomap_write_allocate_t) xfs_iomap_write_allocate,
93 .xfs_iomap_write_unwritten =
94 (xfs_iomap_write_unwritten_t) xfs_iomap_write_unwritten,
95 .xfs_ilock = (xfs_lock_t) xfs_ilock,
96 .xfs_lck_map_shared = (xfs_lck_map_shared_t) xfs_ilock_map_shared,
97 .xfs_ilock_demote = (xfs_lock_demote_t) xfs_ilock_demote,
98 .xfs_ilock_nowait = (xfs_lock_nowait_t) xfs_ilock_nowait,
99 .xfs_unlock = (xfs_unlk_t) xfs_iunlock,
100 .xfs_size_func = (xfs_size_t) xfs_size_fn,
101 .xfs_iodone = (xfs_iodone_t) fs_noerr,
102};
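/*
 * The XFS_BMAPI()/XFS_ILOCK()-style macros used by the iomap code
 * dispatch through this table; a sketch of the presumed expansion
 * (the real definitions live in xfs_mount.h):
 *
 *	#define XFS_BMAPI(mp, trans, io, ...) \
 *		((*(mp)->m_io_ops.xfs_bmapi_func)((trans), (io)->io_obj, ...))
 */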
103
104void
105xfs_iocore_inode_reinit(
106 xfs_inode_t *ip)
107{
108 xfs_iocore_t *io = &ip->i_iocore;
109
110 io->io_flags = 0;
111 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
112 io->io_flags |= XFS_IOCORE_RT;
113 io->io_dmevmask = ip->i_d.di_dmevmask;
114 io->io_dmstate = ip->i_d.di_dmstate;
115}
116
117void
118xfs_iocore_inode_init(
119 xfs_inode_t *ip)
120{
121 xfs_iocore_t *io = &ip->i_iocore;
122 xfs_mount_t *mp = ip->i_mount;
123
124 io->io_mount = mp;
125#ifdef DEBUG
126 io->io_lock = &ip->i_lock;
127 io->io_iolock = &ip->i_iolock;
128#endif
129
130 io->io_obj = (void *)ip;
131
132 xfs_iocore_inode_reinit(ip);
133}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
new file mode 100644
index 000000000000..3826e8f0e28a
--- /dev/null
+++ b/fs/xfs/xfs_iomap.c
@@ -0,0 +1,1000 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_fs.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_alloc.h"
44#include "xfs_dmapi.h"
45#include "xfs_quota.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_bit.h"
59#include "xfs_rtalloc.h"
60#include "xfs_error.h"
61#include "xfs_itable.h"
62#include "xfs_rw.h"
63#include "xfs_acl.h"
64#include "xfs_cap.h"
65#include "xfs_mac.h"
66#include "xfs_attr.h"
67#include "xfs_buf_item.h"
68#include "xfs_trans_space.h"
69#include "xfs_utils.h"
70#include "xfs_iomap.h"
71
72#if defined(XFS_RW_TRACE)
73void
74xfs_iomap_enter_trace(
75 int tag,
76 xfs_iocore_t *io,
77 xfs_off_t offset,
78 ssize_t count)
79{
80 xfs_inode_t *ip = XFS_IO_INODE(io);
81
82 if (!ip->i_rwtrace)
83 return;
84
85 ktrace_enter(ip->i_rwtrace,
86 (void *)((unsigned long)tag),
87 (void *)ip,
88 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
89 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
90 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
91 (void *)((unsigned long)(offset & 0xffffffff)),
92 (void *)((unsigned long)count),
93 (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
94 (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
95 (void *)NULL,
96 (void *)NULL,
97 (void *)NULL,
98 (void *)NULL,
99 (void *)NULL,
100 (void *)NULL,
101 (void *)NULL);
102}
103
104void
105xfs_iomap_map_trace(
106 int tag,
107 xfs_iocore_t *io,
108 xfs_off_t offset,
109 ssize_t count,
110 xfs_iomap_t *iomapp,
111 xfs_bmbt_irec_t *imapp,
112 int flags)
113{
114 xfs_inode_t *ip = XFS_IO_INODE(io);
115
116 if (!ip->i_rwtrace)
117 return;
118
119 ktrace_enter(ip->i_rwtrace,
120 (void *)((unsigned long)tag),
121 (void *)ip,
122 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
123 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
124 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
125 (void *)((unsigned long)(offset & 0xffffffff)),
126 (void *)((unsigned long)count),
127 (void *)((unsigned long)flags),
128 (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)),
129 (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)),
130 (void *)((unsigned long)(iomapp->iomap_delta)),
131 (void *)((unsigned long)(iomapp->iomap_bsize)),
132 (void *)((unsigned long)(iomapp->iomap_bn)),
133 (void *)(__psint_t)(imapp->br_startoff),
134 (void *)((unsigned long)(imapp->br_blockcount)),
135 (void *)(__psint_t)(imapp->br_startblock));
136}
137#else
138#define xfs_iomap_enter_trace(tag, io, offset, count)
139#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags)
140#endif
141
142#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
143 << mp->m_writeio_log)
144#define XFS_STRAT_WRITE_IMAPS 2
145#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
146
147STATIC int
148xfs_imap_to_bmap(
149 xfs_iocore_t *io,
150 xfs_off_t offset,
151 xfs_bmbt_irec_t *imap,
152 xfs_iomap_t *iomapp,
153 int imaps, /* Number of imap entries */
154 int iomaps, /* Number of iomap entries */
155 int flags)
156{
157 xfs_mount_t *mp;
158 xfs_fsize_t nisize;
159 int pbm;
160 xfs_fsblock_t start_block;
161
162 mp = io->io_mount;
163 nisize = XFS_SIZE(mp, io);
164 if (io->io_new_size > nisize)
165 nisize = io->io_new_size;
166
167 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
168 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
169 iomapp->iomap_delta = offset - iomapp->iomap_offset;
170 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
171 iomapp->iomap_flags = flags;
172
173 if (io->io_flags & XFS_IOCORE_RT) {
174 iomapp->iomap_flags |= IOMAP_REALTIME;
175 iomapp->iomap_target = mp->m_rtdev_targp;
176 } else {
177 iomapp->iomap_target = mp->m_ddev_targp;
178 }
179 start_block = imap->br_startblock;
180 if (start_block == HOLESTARTBLOCK) {
181 iomapp->iomap_bn = IOMAP_DADDR_NULL;
182 iomapp->iomap_flags |= IOMAP_HOLE;
183 } else if (start_block == DELAYSTARTBLOCK) {
184 iomapp->iomap_bn = IOMAP_DADDR_NULL;
185 iomapp->iomap_flags |= IOMAP_DELAY;
186 } else {
187 iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block);
188 if (ISUNWRITTEN(imap))
189 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
190 }
191
192 if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) {
193 iomapp->iomap_flags |= IOMAP_EOF;
194 }
195
196 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
197 }
198 return pbm; /* Return the number filled */
199}
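/*
 * For instance (hypothetical numbers): on a 4k-block filesystem, a
 * delayed allocation extent of 8 fsbs at file offset 0 would come
 * back as iomap_offset 0, iomap_bsize 32768, iomap_bn
 * IOMAP_DADDR_NULL, and IOMAP_DELAY set in iomap_flags.
 */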
200
201int
202xfs_iomap(
203 xfs_iocore_t *io,
204 xfs_off_t offset,
205 ssize_t count,
206 int flags,
207 xfs_iomap_t *iomapp,
208 int *niomaps)
209{
210 xfs_mount_t *mp = io->io_mount;
211 xfs_fileoff_t offset_fsb, end_fsb;
212 int error = 0;
213 int lockmode = 0;
214 xfs_bmbt_irec_t imap;
215 int nimaps = 1;
216 int bmapi_flags = 0;
217 int iomap_flags = 0;
218
219 if (XFS_FORCED_SHUTDOWN(mp))
220 return XFS_ERROR(EIO);
221
222 switch (flags &
223 (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
224 BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
225 case BMAPI_READ:
226 xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count);
227 lockmode = XFS_LCK_MAP_SHARED(mp, io);
228 bmapi_flags = XFS_BMAPI_ENTIRE;
229 if (flags & BMAPI_IGNSTATE)
230 bmapi_flags |= XFS_BMAPI_IGSTATE;
231 break;
232 case BMAPI_WRITE:
233 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count);
234 lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
235 bmapi_flags = 0;
236 XFS_ILOCK(mp, io, lockmode);
237 break;
238 case BMAPI_ALLOCATE:
239 xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, io, offset, count);
240 lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
241 bmapi_flags = XFS_BMAPI_ENTIRE;
242 /* Attempt non-blocking lock */
243 if (flags & BMAPI_TRYLOCK) {
244 if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
245 return XFS_ERROR(EAGAIN);
246 } else {
247 XFS_ILOCK(mp, io, lockmode);
248 }
249 break;
250 case BMAPI_UNWRITTEN:
251 goto phase2;
252 case BMAPI_DEVICE:
253 lockmode = XFS_LCK_MAP_SHARED(mp, io);
254 iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
255 mp->m_rtdev_targp : mp->m_ddev_targp;
256 error = 0;
257 *niomaps = 1;
258 goto out;
259 default:
260 BUG();
261 }
262
263 ASSERT(offset <= mp->m_maxioffset);
264 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
265 count = mp->m_maxioffset - offset;
266 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
267 offset_fsb = XFS_B_TO_FSBT(mp, offset);
268
269 error = XFS_BMAPI(mp, NULL, io, offset_fsb,
270 (xfs_filblks_t)(end_fsb - offset_fsb),
271 bmapi_flags, NULL, 0, &imap,
272 &nimaps, NULL);
273
274 if (error)
275 goto out;
276
277phase2:
278 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
279 case BMAPI_WRITE:
280 /* If we found an extent, return it */
281 if (nimaps && (imap.br_startblock != HOLESTARTBLOCK)) {
282 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
283 offset, count, iomapp, &imap, flags);
284 break;
285 }
286
287 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
288 error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
289 count, flags, &imap, &nimaps, nimaps);
290 } else {
291 error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
292 flags, &imap, &nimaps);
293 }
294 if (!error) {
295 xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, io,
296 offset, count, iomapp, &imap, flags);
297 }
298 iomap_flags = IOMAP_NEW;
299 break;
300 case BMAPI_ALLOCATE:
301 /* If we found an extent, return it */
302 XFS_IUNLOCK(mp, io, lockmode);
303 lockmode = 0;
304
305 if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) {
306 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
307 offset, count, iomapp, &imap, flags);
308 break;
309 }
310
311 error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps);
312 break;
313 case BMAPI_UNWRITTEN:
314 lockmode = 0;
315 error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count);
316 nimaps = 0;
317 break;
318 }
319
320 if (nimaps) {
321 *niomaps = xfs_imap_to_bmap(io, offset, &imap,
322 iomapp, nimaps, *niomaps, iomap_flags);
323 } else if (niomaps) {
324 *niomaps = 0;
325 }
326
327out:
328 if (lockmode)
329 XFS_IUNLOCK(mp, io, lockmode);
330 return XFS_ERROR(error);
331}
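/*
 * A sketch of a typical read-side call, with hypothetical values:
 *
 *	xfs_iomap_t	iomap;
 *	int		niomap = 1;
 *	int		error;
 *
 *	error = xfs_iomap(&ip->i_iocore, offset, count,
 *			  BMAPI_READ, &iomap, &niomap);
 *
 * On success, niomap says how many mappings were filled in and iomap
 * describes the blocks (or hole/delalloc state) backing the range.
 */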
332
333STATIC int
334xfs_flush_space(
335 xfs_inode_t *ip,
336 int *fsynced,
337 int *ioflags)
338{
339 switch (*fsynced) {
340 case 0:
341 if (ip->i_delayed_blks) {
342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
343 xfs_flush_inode(ip);
344 xfs_ilock(ip, XFS_ILOCK_EXCL);
345 *fsynced = 1;
346 } else {
347 *ioflags |= BMAPI_SYNC;
348 *fsynced = 2;
349 }
350 return 0;
351 case 1:
352 *fsynced = 2;
353 *ioflags |= BMAPI_SYNC;
354 return 0;
355 case 2:
356 xfs_iunlock(ip, XFS_ILOCK_EXCL);
357 xfs_flush_device(ip);
358 xfs_ilock(ip, XFS_ILOCK_EXCL);
359 *fsynced = 3;
360 return 0;
361 }
362 return 1;
363}
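/*
 * Callers loop over this escalation ladder, retrying the allocation
 * after each stage and treating a nonzero return as final; compare
 * the retry loop in xfs_iomap_write_delay() below:
 *
 *	if (xfs_flush_space(ip, &fsynced, &ioflag))
 *		return XFS_ERROR(ENOSPC);
 *	goto retry;
 */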
364
365int
366xfs_iomap_write_direct(
367 xfs_inode_t *ip,
368 loff_t offset,
369 size_t count,
370 int flags,
371 xfs_bmbt_irec_t *ret_imap,
372 int *nmaps,
373 int found)
374{
375 xfs_mount_t *mp = ip->i_mount;
376 xfs_iocore_t *io = &ip->i_iocore;
377 xfs_fileoff_t offset_fsb;
378 xfs_fileoff_t last_fsb;
379 xfs_filblks_t count_fsb;
380 xfs_fsize_t isize;
381 xfs_fsblock_t firstfsb;
382 int nimaps, maps;
383 int error;
384 int bmapi_flag;
385 int rt;
386 xfs_trans_t *tp;
387 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
388 xfs_bmap_free_t free_list;
389 int aeof;
390 xfs_filblks_t datablocks;
391 int committed;
392 int numrtextents;
393 uint resblks;
394
395 /*
396 * Make sure that the dquots are there. This doesn't hold
397 * the ilock across a disk read.
398 */
399 error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
400 if (error)
401 return XFS_ERROR(error);
402
403 maps = min(XFS_WRITE_IMAPS, *nmaps);
404 nimaps = maps;
405
406 isize = ip->i_d.di_size;
407 aeof = (offset + count) > isize;
408
409 if (io->io_new_size > isize)
410 isize = io->io_new_size;
411
412 offset_fsb = XFS_B_TO_FSBT(mp, offset);
413 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
414 count_fsb = last_fsb - offset_fsb;
415 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) {
416 xfs_fileoff_t map_last_fsb;
417
418 map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
419
420 if (map_last_fsb < last_fsb) {
421 last_fsb = map_last_fsb;
422 count_fsb = last_fsb - offset_fsb;
423 }
424 ASSERT(count_fsb > 0);
425 }
426
427 /*
 428	 * Determine whether we are reserving space on
 429	 * the data or the realtime partition.
430 */
431 if ((rt = XFS_IS_REALTIME_INODE(ip))) {
432 int sbrtextsize, iprtextsize;
433
434 sbrtextsize = mp->m_sb.sb_rextsize;
435 iprtextsize =
436 ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize;
437 numrtextents = (count_fsb + iprtextsize - 1);
438 do_div(numrtextents, sbrtextsize);
439 datablocks = 0;
440 } else {
441 datablocks = count_fsb;
442 numrtextents = 0;
443 }
444
445 /*
446 * allocate and setup the transaction
447 */
448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
449 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
450
451 resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
452
453 error = xfs_trans_reserve(tp, resblks,
454 XFS_WRITE_LOG_RES(mp), numrtextents,
455 XFS_TRANS_PERM_LOG_RES,
456 XFS_WRITE_LOG_COUNT);
457
 458	/*
 459	 * Check for running out of space. If the reservation failed,
 460	 * free the transaction structure. Note that we cannot return
 461	 * yet: the inode lock has to be reacquired before we return
 462	 * (see the error handling below).
 463	 */
 464	if (error)
 465		xfs_trans_cancel(tp, 0);
466
467 xfs_ilock(ip, XFS_ILOCK_EXCL);
468
469 if (error)
 470		goto error_out;	/* we couldn't return right after the failed
 471				   reservation; the ilock had to be retaken */
472
473 if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) {
474 error = (EDQUOT);
475 goto error1;
476 }
477 nimaps = 1;
478
479 bmapi_flag = XFS_BMAPI_WRITE;
480 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
481 xfs_trans_ihold(tp, ip);
482
483 if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt))
484 bmapi_flag |= XFS_BMAPI_PREALLOC;
485
486 /*
487 * issue the bmapi() call to allocate the blocks
488 */
489 XFS_BMAP_INIT(&free_list, &firstfsb);
490 imapp = &imap[0];
491 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
492 bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list);
493 if (error) {
494 goto error0;
495 }
496
497 /*
498 * complete the transaction
499 */
500
501 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
502 if (error) {
503 goto error0;
504 }
505
506 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
507 if (error) {
508 goto error_out;
509 }
510
511 /* copy any maps to caller's array and return any error. */
512 if (nimaps == 0) {
513 error = (ENOSPC);
514 goto error_out;
515 }
516
517 *ret_imap = imap[0];
518 *nmaps = 1;
519 if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) {
520 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld "
521 "start_block : %llx start_off : %llx blkcnt : %llx "
522 "extent-state : %x \n",
523 (ip->i_mount)->m_fsname,
524 (long long)ip->i_ino,
525 ret_imap->br_startblock, ret_imap->br_startoff,
526 ret_imap->br_blockcount,ret_imap->br_state);
527 }
528 return 0;
529
530 error0: /* Cancel bmap, unlock inode, and cancel trans */
531 xfs_bmap_cancel(&free_list);
532
533 error1: /* Just cancel transaction */
534 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
535 *nmaps = 0; /* nothing set-up here */
536
537error_out:
538 return XFS_ERROR(error);
539}
540
541int
542xfs_iomap_write_delay(
543 xfs_inode_t *ip,
544 loff_t offset,
545 size_t count,
546 int ioflag,
547 xfs_bmbt_irec_t *ret_imap,
548 int *nmaps)
549{
550 xfs_mount_t *mp = ip->i_mount;
551 xfs_iocore_t *io = &ip->i_iocore;
552 xfs_fileoff_t offset_fsb;
553 xfs_fileoff_t last_fsb;
554 xfs_fsize_t isize;
555 xfs_fsblock_t firstblock;
556 int nimaps;
557 int error;
558 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
559 int aeof;
560 int fsynced = 0;
561
562 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
563
564 /*
565 * Make sure that the dquots are there. This doesn't hold
566 * the ilock across a disk read.
567 */
568
569 error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
570 if (error)
571 return XFS_ERROR(error);
572
573retry:
574 isize = ip->i_d.di_size;
575 if (io->io_new_size > isize) {
576 isize = io->io_new_size;
577 }
578
579 aeof = 0;
580 offset_fsb = XFS_B_TO_FSBT(mp, offset);
581 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
582 /*
583 * If the caller is doing a write at the end of the file,
584 * then extend the allocation (and the buffer used for the write)
585 * out to the file system's write iosize. We clean up any extra
586 * space left over when the file is closed in xfs_inactive().
587 *
588 * For sync writes, we are flushing delayed allocate space to
589 * try to make additional space available for allocation near
590 * the filesystem full boundary - preallocation hurts in that
591 * situation, of course.
592 */
593 if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
594 xfs_off_t aligned_offset;
595 xfs_filblks_t count_fsb;
596 unsigned int iosize;
597 xfs_fileoff_t ioalign;
598 int n;
599 xfs_fileoff_t start_fsb;
600
601 /*
602 * If there are any real blocks past eof, then don't
603 * do any speculative allocation.
604 */
605 start_fsb = XFS_B_TO_FSBT(mp,
606 ((xfs_ufsize_t)(offset + count - 1)));
607 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
608 while (count_fsb > 0) {
609 nimaps = XFS_WRITE_IMAPS;
610 error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
611 0, &firstblock, 0, imap, &nimaps, NULL);
612 if (error) {
613 return error;
614 }
615 for (n = 0; n < nimaps; n++) {
616 if ( !(io->io_flags & XFS_IOCORE_RT) &&
617 !imap[n].br_startblock) {
618 cmn_err(CE_PANIC,"Access to block "
619 "zero: fs <%s> inode: %lld "
620 "start_block : %llx start_off "
621 ": %llx blkcnt : %llx "
622 "extent-state : %x \n",
623 (ip->i_mount)->m_fsname,
624 (long long)ip->i_ino,
625 imap[n].br_startblock,
626 imap[n].br_startoff,
627 imap[n].br_blockcount,
628 imap[n].br_state);
629 }
630 if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
631 (imap[n].br_startblock != DELAYSTARTBLOCK)) {
632 goto write_map;
633 }
634 start_fsb += imap[n].br_blockcount;
635 count_fsb -= imap[n].br_blockcount;
636 }
637 }
638 iosize = mp->m_writeio_blocks;
639 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
640 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
641 last_fsb = ioalign + iosize;
642 aeof = 1;
643 }
644write_map:
645 nimaps = XFS_WRITE_IMAPS;
646 firstblock = NULLFSBLOCK;
647
648 /*
 649	 * If mounted with the "-o swalloc" option, round up the allocation
650 * request to a stripe width boundary if the file size is >=
651 * stripe width and we are allocating past the allocation eof.
652 */
653 if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_swidth
654 && (mp->m_flags & XFS_MOUNT_SWALLOC)
655 && (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)) && aeof) {
656 int eof;
657 xfs_fileoff_t new_last_fsb;
658
659 new_last_fsb = roundup_64(last_fsb, mp->m_swidth);
660 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
661 if (error) {
662 return error;
663 }
664 if (eof) {
665 last_fsb = new_last_fsb;
666 }
667 /*
668 * Roundup the allocation request to a stripe unit (m_dalign) boundary
669 * if the file size is >= stripe unit size, and we are allocating past
670 * the allocation eof.
671 */
672 } else if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_dalign &&
673 (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)) && aeof) {
674 int eof;
675 xfs_fileoff_t new_last_fsb;
676 new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
677 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
678 if (error) {
679 return error;
680 }
681 if (eof) {
682 last_fsb = new_last_fsb;
683 }
684 /*
685 * Round up the allocation request to a real-time extent boundary
686 * if the file is on the real-time subvolume.
687 */
688 } else if (io->io_flags & XFS_IOCORE_RT && aeof) {
689 int eof;
690 xfs_fileoff_t new_last_fsb;
691
692 new_last_fsb = roundup_64(last_fsb, mp->m_sb.sb_rextsize);
693 error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
694 if (error) {
695 return error;
696 }
697 if (eof)
698 last_fsb = new_last_fsb;
699 }
700 error = xfs_bmapi(NULL, ip, offset_fsb,
701 (xfs_filblks_t)(last_fsb - offset_fsb),
702 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
703 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
704 &nimaps, NULL);
705 /*
706 * This can be EDQUOT, if nimaps == 0
707 */
708 if (error && (error != ENOSPC)) {
709 return XFS_ERROR(error);
710 }
711 /*
712	 * If bmapi returned no mappings and we didn't get back EDQUOT,
713 * then we must have run out of space.
714 */
715 if (nimaps == 0) {
716 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
717 io, offset, count);
718 if (xfs_flush_space(ip, &fsynced, &ioflag))
719 return XFS_ERROR(ENOSPC);
720
721 error = 0;
722 goto retry;
723 }
724
725 *ret_imap = imap[0];
726 *nmaps = 1;
727 if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) {
728 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld "
729 "start_block : %llx start_off : %llx blkcnt : %llx "
730 "extent-state : %x \n",
731 (ip->i_mount)->m_fsname,
732 (long long)ip->i_ino,
733 ret_imap->br_startblock, ret_imap->br_startoff,
734 ret_imap->br_blockcount,ret_imap->br_state);
735 }
736 return 0;
737}
738
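/*
 * Editorial sketch (not part of the original source): the speculative
 * preallocation sizing above reduces to rounding the last byte of the
 * write down to a write-iosize boundary and then extending the
 * allocation one full write iosize beyond it. The helper below merely
 * restates that arithmetic; the function name is hypothetical.
 */
static inline xfs_fileoff_t
example_speculative_last_fsb(
	xfs_mount_t	*mp,
	xfs_off_t	offset,
	size_t		count)
{
	/* round the last byte written down to a write-iosize boundary */
	xfs_off_t	aligned_offset =
			XFS_WRITEIO_ALIGN(mp, (offset + count - 1));

	/* ... then allocate out to one full write iosize past it */
	return XFS_B_TO_FSBT(mp, aligned_offset) + mp->m_writeio_blocks;
}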
739/*
740 * Pass in a delayed allocate extent and convert it to real extents;
741 * return to the caller the extent we create, which maps on top of
742 * the originating caller's request.
743 *
744 * Called without a lock on the inode.
745 */
746int
747xfs_iomap_write_allocate(
748 xfs_inode_t *ip,
749 xfs_bmbt_irec_t *map,
750 int *retmap)
751{
752 xfs_mount_t *mp = ip->i_mount;
753 xfs_iocore_t *io = &ip->i_iocore;
754 xfs_fileoff_t offset_fsb, last_block;
755 xfs_fileoff_t end_fsb, map_start_fsb;
756 xfs_fsblock_t first_block;
757 xfs_bmap_free_t free_list;
758 xfs_filblks_t count_fsb;
759 xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS];
760 xfs_trans_t *tp;
761 int i, nimaps, committed;
762 int error = 0;
763 int nres;
764
765 *retmap = 0;
766
767 /*
768 * Make sure that the dquots are there.
769 */
770 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
771 return XFS_ERROR(error);
772
773 offset_fsb = map->br_startoff;
774 count_fsb = map->br_blockcount;
775 map_start_fsb = offset_fsb;
776
777 XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
778
779 while (count_fsb != 0) {
780 /*
781 * Set up a transaction with which to allocate the
782 * backing store for the file. Do allocations in a
783 * loop until we get some space in the range we are
784 * interested in. The other space that might be allocated
785 * is in the delayed allocation extent on which we sit
786 * but before our buffer starts.
787 */
788
789 nimaps = 0;
790 while (nimaps == 0) {
791 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
792 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
793 error = xfs_trans_reserve(tp, nres,
794 XFS_WRITE_LOG_RES(mp),
795 0, XFS_TRANS_PERM_LOG_RES,
796 XFS_WRITE_LOG_COUNT);
797 if (error == ENOSPC) {
798 error = xfs_trans_reserve(tp, 0,
799 XFS_WRITE_LOG_RES(mp),
800 0,
801 XFS_TRANS_PERM_LOG_RES,
802 XFS_WRITE_LOG_COUNT);
803 }
804 if (error) {
805 xfs_trans_cancel(tp, 0);
806 return XFS_ERROR(error);
807 }
808 xfs_ilock(ip, XFS_ILOCK_EXCL);
809 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
810 xfs_trans_ihold(tp, ip);
811
812 XFS_BMAP_INIT(&free_list, &first_block);
813
814 nimaps = XFS_STRAT_WRITE_IMAPS;
815 /*
816 * Ensure we don't go beyond eof - it is possible
817 * the extents changed since we did the read call,
818			 * as we dropped the ilock in the interim.
819 */
820
821 end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
822 xfs_bmap_last_offset(NULL, ip, &last_block,
823 XFS_DATA_FORK);
824 last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
825 if ((map_start_fsb + count_fsb) > last_block) {
826 count_fsb = last_block - map_start_fsb;
827 if (count_fsb == 0) {
828 error = EAGAIN;
829 goto trans_cancel;
830 }
831 }
832
833 /* Go get the actual blocks */
834 error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
835 XFS_BMAPI_WRITE, &first_block, 1,
836 imap, &nimaps, &free_list);
837 if (error)
838 goto trans_cancel;
839
840 error = xfs_bmap_finish(&tp, &free_list,
841 first_block, &committed);
842 if (error)
843 goto trans_cancel;
844
845 error = xfs_trans_commit(tp,
846 XFS_TRANS_RELEASE_LOG_RES, NULL);
847 if (error)
848 goto error0;
849
850 xfs_iunlock(ip, XFS_ILOCK_EXCL);
851 }
852
853 /*
854 * See if we were able to allocate an extent that
855		 * covers at least part of the caller's request.
856 */
857
858 for (i = 0; i < nimaps; i++) {
859 if ( !(io->io_flags & XFS_IOCORE_RT) &&
860 !imap[i].br_startblock) {
861 cmn_err(CE_PANIC,"Access to block zero: "
862 "fs <%s> inode: %lld "
863 "start_block : %llx start_off : %llx "
864 "blkcnt : %llx extent-state : %x \n",
865 (ip->i_mount)->m_fsname,
866 (long long)ip->i_ino,
867 imap[i].br_startblock,
868 imap[i].br_startoff,
869 imap[i].br_blockcount,imap[i].br_state);
870 }
871 if ((map->br_startoff >= imap[i].br_startoff) &&
872 (map->br_startoff < (imap[i].br_startoff +
873 imap[i].br_blockcount))) {
874 *map = imap[i];
875 *retmap = 1;
876 XFS_STATS_INC(xs_xstrat_quick);
877 return 0;
878 }
879 count_fsb -= imap[i].br_blockcount;
880 }
881
882 /* So far we have not mapped the requested part of the
883		 * file, just surrounding data; try again.
884 */
885 nimaps--;
886 offset_fsb = imap[nimaps].br_startoff +
887 imap[nimaps].br_blockcount;
888 map_start_fsb = offset_fsb;
889 }
890
891trans_cancel:
892 xfs_bmap_cancel(&free_list);
893 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
894error0:
895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
896 return XFS_ERROR(error);
897}
898
899int
900xfs_iomap_write_unwritten(
901 xfs_inode_t *ip,
902 loff_t offset,
903 size_t count)
904{
905 xfs_mount_t *mp = ip->i_mount;
906 xfs_iocore_t *io = &ip->i_iocore;
907 xfs_trans_t *tp;
908 xfs_fileoff_t offset_fsb;
909 xfs_filblks_t count_fsb;
910 xfs_filblks_t numblks_fsb;
911 xfs_bmbt_irec_t imap;
912 int committed;
913 int error;
914 int nres;
915 int nimaps;
916 xfs_fsblock_t firstfsb;
917 xfs_bmap_free_t free_list;
918
919 xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN,
920 &ip->i_iocore, offset, count);
921
922 offset_fsb = XFS_B_TO_FSBT(mp, offset);
923 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
924 count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
925
926 do {
927 nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
928
929 /*
930		 * Set up a transaction to convert the range of extents
931 * from unwritten to real. Do allocations in a loop until
932 * we have covered the range passed in.
933 */
934
935 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
936 error = xfs_trans_reserve(tp, nres,
937 XFS_WRITE_LOG_RES(mp), 0,
938 XFS_TRANS_PERM_LOG_RES,
939 XFS_WRITE_LOG_COUNT);
940 if (error) {
941 xfs_trans_cancel(tp, 0);
942 goto error0;
943 }
944
945 xfs_ilock(ip, XFS_ILOCK_EXCL);
946 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
947 xfs_trans_ihold(tp, ip);
948
949 /*
950 * Modify the unwritten extent state of the buffer.
951 */
952 XFS_BMAP_INIT(&free_list, &firstfsb);
953 nimaps = 1;
954 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
955 XFS_BMAPI_WRITE, &firstfsb,
956 1, &imap, &nimaps, &free_list);
957 if (error)
958 goto error_on_bmapi_transaction;
959
960		error = xfs_bmap_finish(&tp, &free_list,
961 firstfsb, &committed);
962 if (error)
963 goto error_on_bmapi_transaction;
964
965 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
966 xfs_iunlock(ip, XFS_ILOCK_EXCL);
967 if (error)
968 goto error0;
969
970 if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) {
971 cmn_err(CE_PANIC,"Access to block zero: fs <%s> "
972 "inode: %lld start_block : %llx start_off : "
973 "%llx blkcnt : %llx extent-state : %x \n",
974 (ip->i_mount)->m_fsname,
975 (long long)ip->i_ino,
976 imap.br_startblock,imap.br_startoff,
977 imap.br_blockcount,imap.br_state);
978 }
979
980 if ((numblks_fsb = imap.br_blockcount) == 0) {
981 /*
982 * The numblks_fsb value should always get
983 * smaller, otherwise the loop is stuck.
984 */
985 ASSERT(imap.br_blockcount);
986 break;
987 }
988 offset_fsb += numblks_fsb;
989 count_fsb -= numblks_fsb;
990 } while (count_fsb > 0);
991
992 return 0;
993
994error_on_bmapi_transaction:
995 xfs_bmap_cancel(&free_list);
996 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
997 xfs_iunlock(ip, XFS_ILOCK_EXCL);
998error0:
999 return XFS_ERROR(error);
1000}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
new file mode 100644
index 000000000000..31c91087cb33
--- /dev/null
+++ b/fs/xfs/xfs_iomap.h
@@ -0,0 +1,107 @@
1/*
2 * Copyright (c) 2003,2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33
34
35#ifndef __XFS_IOMAP_H__
36#define __XFS_IOMAP_H__
37
38#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
39
40
41typedef enum { /* iomap_flags values */
42 IOMAP_EOF = 0x01, /* mapping contains EOF */
43 IOMAP_HOLE = 0x02, /* mapping covers a hole */
44 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
45 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
46 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
47 /* but uninitialized file data */
48 IOMAP_NEW = 0x40 /* just allocate */
49} iomap_flags_t;
50
51typedef enum {
52 /* base extent manipulation calls */
53 BMAPI_READ = (1 << 0), /* read extents */
54 BMAPI_WRITE = (1 << 1), /* create extents */
55 BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */
56 BMAPI_UNWRITTEN = (1 << 3), /* unwritten extents to real extents */
57 /* modifiers */
58 BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */
59 BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */
60 BMAPI_MMAP = (1 << 6), /* allocate for mmap write */
61 BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */
62 BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */
63 BMAPI_DEVICE = (1 << 9), /* we only want to know the device */
64} bmapi_flags_t;
65
66
67/*
68 * xfs_iomap_t: File system I/O map
69 *
70 * The iomap_bn field is expressed in 512-byte blocks, and is where the
71 * mapping starts on disk.
72 *
73 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
74 * iomap_offset is the offset of the mapping in the file itself.
75 * iomap_bsize is the size of the mapping, iomap_delta is the
76 * desired data's offset into the mapping, given the offset supplied
77 * to the file I/O map routine.
78 *
79 * When a request is made to read beyond the logical end of the object,
80 * the size of the mapping may be set to 0, but the offset and length fields
81 * should still reflect the actual amount of underlying storage allocated, if any.
82 */
83
84typedef struct xfs_iomap {
85 xfs_daddr_t iomap_bn; /* first 512b blk of mapping */
86 xfs_buftarg_t *iomap_target;
87 loff_t iomap_offset; /* offset of mapping, bytes */
88 loff_t iomap_bsize; /* size of mapping, bytes */
89 size_t iomap_delta; /* offset into mapping, bytes */
90 iomap_flags_t iomap_flags;
91} xfs_iomap_t;
92
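/*
 * Editorial sketch (not part of the original source): how a consumer
 * of xfs_iomap_t typically locates the requested data on disk. Since
 * iomap_bn counts 512-byte basic blocks and iomap_delta is a byte
 * offset into the mapping, the data's disk address is iomap_bn plus
 * the whole basic blocks of iomap_delta; any sub-block remainder is
 * handled by the caller. The helper name is hypothetical, and BBSHIFT
 * (the log2 of the 512-byte basic block size, 9) is assumed to be
 * visible from the XFS headers.
 */
static inline xfs_daddr_t
example_iomap_data_daddr(xfs_iomap_t *iomap)
{
	/* drop the sub-block remainder, keep whole basic blocks */
	return iomap->iomap_bn + (iomap->iomap_delta >> BBSHIFT);
}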
93struct xfs_iocore;
94struct xfs_inode;
95struct xfs_bmbt_irec;
96
97extern int xfs_iomap(struct xfs_iocore *, xfs_off_t, ssize_t, int,
98 struct xfs_iomap *, int *);
99extern int xfs_iomap_write_direct(struct xfs_inode *, loff_t, size_t,
100 int, struct xfs_bmbt_irec *, int *, int);
101extern int xfs_iomap_write_delay(struct xfs_inode *, loff_t, size_t, int,
102 struct xfs_bmbt_irec *, int *);
103extern int xfs_iomap_write_allocate(struct xfs_inode *,
104 struct xfs_bmbt_irec *, int *);
105extern int xfs_iomap_write_unwritten(struct xfs_inode *, loff_t, size_t);
106
107#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
new file mode 100644
index 000000000000..8fbc8d378188
--- /dev/null
+++ b/fs/xfs/xfs_itable.c
@@ -0,0 +1,858 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_ag.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_attr_sf.h"
50#include "xfs_dir_sf.h"
51#include "xfs_dir2_sf.h"
52#include "xfs_dinode.h"
53#include "xfs_inode.h"
54#include "xfs_ialloc.h"
55#include "xfs_itable.h"
56#include "xfs_error.h"
57
58#ifndef HAVE_USERACC
59#define useracc(ubuffer, size, flags, foo) (0)
60#define unuseracc(ubuffer, size, flags)
61#endif
62
63STATIC int
64xfs_bulkstat_one_iget(
65 xfs_mount_t *mp, /* mount point for filesystem */
66 xfs_ino_t ino, /* inode number to get data for */
67 xfs_daddr_t bno, /* starting bno of inode cluster */
68 xfs_bstat_t *buf, /* return buffer */
69 int *stat) /* BULKSTAT_RV_... */
70{
71 xfs_dinode_core_t *dic; /* dinode core info pointer */
72 xfs_inode_t *ip; /* incore inode pointer */
73 int error;
74
75 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno);
76 if (error) {
77 *stat = BULKSTAT_RV_NOTHING;
78 return error;
79 }
80
81 ASSERT(ip != NULL);
82 ASSERT(ip->i_blkno != (xfs_daddr_t)0);
83 if (ip->i_d.di_mode == 0) {
84 *stat = BULKSTAT_RV_NOTHING;
85 error = XFS_ERROR(ENOENT);
86 goto out_iput;
87 }
88
89 dic = &ip->i_d;
90
91 /* xfs_iget returns the following without needing
92 * further change.
93 */
94 buf->bs_nlink = dic->di_nlink;
95 buf->bs_projid = dic->di_projid;
96 buf->bs_ino = ino;
97 buf->bs_mode = dic->di_mode;
98 buf->bs_uid = dic->di_uid;
99 buf->bs_gid = dic->di_gid;
100 buf->bs_size = dic->di_size;
101 buf->bs_atime.tv_sec = dic->di_atime.t_sec;
102 buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
103 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
104 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
105 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
106 buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
107 buf->bs_xflags = xfs_ip2xflags(ip);
108 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
109 buf->bs_extents = dic->di_nextents;
110 buf->bs_gen = dic->di_gen;
111 memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
112 buf->bs_dmevmask = dic->di_dmevmask;
113 buf->bs_dmstate = dic->di_dmstate;
114 buf->bs_aextents = dic->di_anextents;
115
116 switch (dic->di_format) {
117 case XFS_DINODE_FMT_DEV:
118 buf->bs_rdev = ip->i_df.if_u2.if_rdev;
119 buf->bs_blksize = BLKDEV_IOSIZE;
120 buf->bs_blocks = 0;
121 break;
122 case XFS_DINODE_FMT_LOCAL:
123 case XFS_DINODE_FMT_UUID:
124 buf->bs_rdev = 0;
125 buf->bs_blksize = mp->m_sb.sb_blocksize;
126 buf->bs_blocks = 0;
127 break;
128 case XFS_DINODE_FMT_EXTENTS:
129 case XFS_DINODE_FMT_BTREE:
130 buf->bs_rdev = 0;
131 buf->bs_blksize = mp->m_sb.sb_blocksize;
132 buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
133 break;
134 }
135
136 out_iput:
137 xfs_iput(ip, XFS_ILOCK_SHARED);
138 return error;
139}
140
141STATIC int
142xfs_bulkstat_one_dinode(
143 xfs_mount_t *mp, /* mount point for filesystem */
144 xfs_ino_t ino, /* inode number to get data for */
145 xfs_dinode_t *dip, /* dinode inode pointer */
146 xfs_bstat_t *buf) /* return buffer */
147{
148 xfs_dinode_core_t *dic; /* dinode core info pointer */
149
150 dic = &dip->di_core;
151
152 /*
153 * The inode format changed when we moved the link count and
154 * made it 32 bits long. If this is an old format inode,
155 * convert it in memory to look like a new one. If it gets
156 * flushed to disk we will convert back before flushing or
157 * logging it. We zero out the new projid field and the old link
158 * count field. We'll handle clearing the pad field (the remains
159 * of the old uuid field) when we actually convert the inode to
160 * the new format. We don't change the version number so that we
161 * can distinguish this from a real new format inode.
162 */
163 if (INT_GET(dic->di_version, ARCH_CONVERT) == XFS_DINODE_VERSION_1) {
164 buf->bs_nlink = INT_GET(dic->di_onlink, ARCH_CONVERT);
165 buf->bs_projid = 0;
166 } else {
167 buf->bs_nlink = INT_GET(dic->di_nlink, ARCH_CONVERT);
168 buf->bs_projid = INT_GET(dic->di_projid, ARCH_CONVERT);
169 }
170
171 buf->bs_ino = ino;
172 buf->bs_mode = INT_GET(dic->di_mode, ARCH_CONVERT);
173 buf->bs_uid = INT_GET(dic->di_uid, ARCH_CONVERT);
174 buf->bs_gid = INT_GET(dic->di_gid, ARCH_CONVERT);
175 buf->bs_size = INT_GET(dic->di_size, ARCH_CONVERT);
176 buf->bs_atime.tv_sec = INT_GET(dic->di_atime.t_sec, ARCH_CONVERT);
177 buf->bs_atime.tv_nsec = INT_GET(dic->di_atime.t_nsec, ARCH_CONVERT);
178 buf->bs_mtime.tv_sec = INT_GET(dic->di_mtime.t_sec, ARCH_CONVERT);
179 buf->bs_mtime.tv_nsec = INT_GET(dic->di_mtime.t_nsec, ARCH_CONVERT);
180 buf->bs_ctime.tv_sec = INT_GET(dic->di_ctime.t_sec, ARCH_CONVERT);
181 buf->bs_ctime.tv_nsec = INT_GET(dic->di_ctime.t_nsec, ARCH_CONVERT);
182 buf->bs_xflags = xfs_dic2xflags(dic);
183 buf->bs_extsize = INT_GET(dic->di_extsize, ARCH_CONVERT) << mp->m_sb.sb_blocklog;
184 buf->bs_extents = INT_GET(dic->di_nextents, ARCH_CONVERT);
185 buf->bs_gen = INT_GET(dic->di_gen, ARCH_CONVERT);
186 memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
187 buf->bs_dmevmask = INT_GET(dic->di_dmevmask, ARCH_CONVERT);
188 buf->bs_dmstate = INT_GET(dic->di_dmstate, ARCH_CONVERT);
189 buf->bs_aextents = INT_GET(dic->di_anextents, ARCH_CONVERT);
190
191 switch (INT_GET(dic->di_format, ARCH_CONVERT)) {
192 case XFS_DINODE_FMT_DEV:
193 buf->bs_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
194 buf->bs_blksize = BLKDEV_IOSIZE;
195 buf->bs_blocks = 0;
196 break;
197 case XFS_DINODE_FMT_LOCAL:
198 case XFS_DINODE_FMT_UUID:
199 buf->bs_rdev = 0;
200 buf->bs_blksize = mp->m_sb.sb_blocksize;
201 buf->bs_blocks = 0;
202 break;
203 case XFS_DINODE_FMT_EXTENTS:
204 case XFS_DINODE_FMT_BTREE:
205 buf->bs_rdev = 0;
206 buf->bs_blksize = mp->m_sb.sb_blocksize;
207 buf->bs_blocks = INT_GET(dic->di_nblocks, ARCH_CONVERT);
208 break;
209 }
210
211 return 0;
212}
213
214/*
215 * Return stat information for one inode.
216 * Return 0 if ok, else errno.
217 */
218int /* error status */
219xfs_bulkstat_one(
220 xfs_mount_t *mp, /* mount point for filesystem */
221 xfs_ino_t ino, /* inode number to get data for */
222 void __user *buffer, /* buffer to place output in */
223 int ubsize, /* size of buffer */
224 void *private_data, /* my private data */
225 xfs_daddr_t bno, /* starting bno of inode cluster */
226 int *ubused, /* bytes used by me */
227 void *dibuff, /* on-disk inode buffer */
228 int *stat) /* BULKSTAT_RV_... */
229{
230 xfs_bstat_t *buf; /* return buffer */
231 int error = 0; /* error value */
232 xfs_dinode_t *dip; /* dinode inode pointer */
233
234 dip = (xfs_dinode_t *)dibuff;
235
236 if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
237 (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
238 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) {
239 *stat = BULKSTAT_RV_NOTHING;
240 return XFS_ERROR(EINVAL);
241 }
242 if (ubsize < sizeof(*buf)) {
243 *stat = BULKSTAT_RV_NOTHING;
244 return XFS_ERROR(ENOMEM);
245 }
246
247 buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
248
249 if (dip == NULL) {
250 /* We're not being passed a pointer to a dinode. This happens
251 * if BULKSTAT_FG_IGET is selected. Do the iget.
252 */
253 error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat);
254 if (error)
255 goto out_free;
256 } else {
257 xfs_bulkstat_one_dinode(mp, ino, dip, buf);
258 }
259
260 if (copy_to_user(buffer, buf, sizeof(*buf))) {
261 *stat = BULKSTAT_RV_NOTHING;
262 error = EFAULT;
263 goto out_free;
264 }
265
266 *stat = BULKSTAT_RV_DIDONE;
267 if (ubused)
268 *ubused = sizeof(*buf);
269
270 out_free:
271 kmem_free(buf, sizeof(*buf));
272 return error;
273}
274
275/*
276 * Return stat information in bulk (by-inode) for the filesystem.
277 */
278int /* error status */
279xfs_bulkstat(
280 xfs_mount_t *mp, /* mount point for filesystem */
281 xfs_ino_t *lastinop, /* last inode returned */
282 int *ubcountp, /* size of buffer/count returned */
283 bulkstat_one_pf formatter, /* func that'd fill a single buf */
284 void *private_data,/* private data for formatter */
285 size_t statstruct_size, /* sizeof struct filling */
286 char __user *ubuffer, /* buffer with inode stats */
287 int flags, /* defined in xfs_itable.h */
288	int		*done)		/* set to 1 when there are no more stats */
289{
290 xfs_agblock_t agbno=0;/* allocation group block number */
291 xfs_buf_t *agbp; /* agi header buffer */
292 xfs_agi_t *agi; /* agi header data */
293 xfs_agino_t agino; /* inode # in allocation group */
294 xfs_agnumber_t agno; /* allocation group number */
295 xfs_daddr_t bno; /* inode cluster start daddr */
296 int chunkidx; /* current index into inode chunk */
297 int clustidx; /* current index into inode cluster */
298 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
299 int end_of_ag; /* set if we've seen the ag end */
300 int error; /* error code */
301 int fmterror;/* bulkstat formatter result */
302 __int32_t gcnt; /* current btree rec's count */
303 xfs_inofree_t gfree; /* current btree rec's free mask */
304 xfs_agino_t gino; /* current btree rec's start inode */
305 int i; /* loop index */
306 int icount; /* count of inodes good in irbuf */
307 xfs_ino_t ino; /* inode number (filesystem) */
308 xfs_inobt_rec_t *irbp; /* current irec buffer pointer */
309 xfs_inobt_rec_t *irbuf; /* start of irec buffer */
310 xfs_inobt_rec_t *irbufend; /* end of good irec buffer entries */
311 xfs_ino_t lastino=0; /* last inode number returned */
312 int nbcluster; /* # of blocks in a cluster */
313 int nicluster; /* # of inodes in a cluster */
314 int nimask; /* mask for inode clusters */
315 int nirbuf; /* size of irbuf */
316 int rval; /* return value error code */
317 int tmp; /* result value from btree calls */
318 int ubcount; /* size of user's buffer */
319 int ubleft; /* bytes left in user's buffer */
320 char __user *ubufp; /* pointer into user's buffer */
321 int ubelem; /* spaces used in user's buffer */
322 int ubused; /* bytes used by formatter */
323 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
324 xfs_dinode_t *dip; /* ptr into bp for specific inode */
325 xfs_inode_t *ip; /* ptr to in-core inode struct */
326
327 /*
328	 * Get the last inode value and check whether there is anything to do.
329 */
330 ino = (xfs_ino_t)*lastinop;
331 dip = NULL;
332 agno = XFS_INO_TO_AGNO(mp, ino);
333 agino = XFS_INO_TO_AGINO(mp, ino);
334 if (agno >= mp->m_sb.sb_agcount ||
335 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
336 *done = 1;
337 *ubcountp = 0;
338 return 0;
339 }
340 ubcount = *ubcountp; /* statstruct's */
341 ubleft = ubcount * statstruct_size; /* bytes */
342 *ubcountp = ubelem = 0;
343 *done = 0;
344 fmterror = 0;
345 ubufp = ubuffer;
346 nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
347 mp->m_sb.sb_inopblock :
348 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
349 nimask = ~(nicluster - 1);
350 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
351 /*
352	 * Lock down the user's buffer. If a buffer was not sent, as when the
353	 * disk quota code calls in here, we skip this.
354 */
355 if (ubuffer &&
356 (error = useracc(ubuffer, ubcount * statstruct_size,
357 (B_READ|B_PHYS), NULL))) {
358 return error;
359 }
360 /*
361 * Allocate a page-sized buffer for inode btree records.
362 * We could try allocating something smaller, but for normal
363 * calls we'll always (potentially) need the whole page.
364 */
365 irbuf = kmem_alloc(NBPC, KM_SLEEP);
366 nirbuf = NBPC / sizeof(*irbuf);
367 /*
368 * Loop over the allocation groups, starting from the last
369 * inode returned; 0 means start of the allocation group.
370 */
371 rval = 0;
372 while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) {
373 bp = NULL;
374 down_read(&mp->m_peraglock);
375 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
376 up_read(&mp->m_peraglock);
377 if (error) {
378 /*
379 * Skip this allocation group and go to the next one.
380 */
381 agno++;
382 agino = 0;
383 continue;
384 }
385 agi = XFS_BUF_TO_AGI(agbp);
386 /*
387 * Allocate and initialize a btree cursor for ialloc btree.
388 */
389 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO,
390 (xfs_inode_t *)0, 0);
391 irbp = irbuf;
392 irbufend = irbuf + nirbuf;
393 end_of_ag = 0;
394 /*
395		 * If we're resuming in the middle of an allocation group,
396 * we need to get the remainder of the chunk we're in.
397 */
398 if (agino > 0) {
399 /*
400 * Lookup the inode chunk that this inode lives in.
401 */
402 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
403 if (!error && /* no I/O error */
404 tmp && /* lookup succeeded */
405 /* got the record, should always work */
406 !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
407 &gfree, &i)) &&
408 i == 1 &&
409 /* this is the right chunk */
410 agino < gino + XFS_INODES_PER_CHUNK &&
411 /* lastino was not last in chunk */
412 (chunkidx = agino - gino + 1) <
413 XFS_INODES_PER_CHUNK &&
414 /* there are some left allocated */
415 XFS_INOBT_MASKN(chunkidx,
416 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
417 /*
418 * Grab the chunk record. Mark all the
419 * uninteresting inodes (because they're
420 * before our start point) free.
421 */
422 for (i = 0; i < chunkidx; i++) {
423 if (XFS_INOBT_MASK(i) & ~gfree)
424 gcnt++;
425 }
426 gfree |= XFS_INOBT_MASKN(0, chunkidx);
427 INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
428 INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
429 INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
430 irbp++;
431 agino = gino + XFS_INODES_PER_CHUNK;
432 icount = XFS_INODES_PER_CHUNK - gcnt;
433 } else {
434 /*
435 * If any of those tests failed, bump the
436 * inode number (just in case).
437 */
438 agino++;
439 icount = 0;
440 }
441 /*
442 * In any case, increment to the next record.
443 */
444 if (!error)
445 error = xfs_inobt_increment(cur, 0, &tmp);
446 } else {
447 /*
448 * Start of ag. Lookup the first inode chunk.
449 */
450 error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
451 icount = 0;
452 }
453 /*
454 * Loop through inode btree records in this ag,
455 * until we run out of inodes or space in the buffer.
456 */
457 while (irbp < irbufend && icount < ubcount) {
458 /*
459 * Loop as long as we're unable to read the
460 * inode btree.
461 */
462 while (error) {
463 agino += XFS_INODES_PER_CHUNK;
464 if (XFS_AGINO_TO_AGBNO(mp, agino) >=
465 INT_GET(agi->agi_length, ARCH_CONVERT))
466 break;
467 error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
468 &tmp);
469 }
470 /*
471			 * If we ran off the end of the ag, either with an error
472			 * or the normal way, set end-of-ag and stop collecting.
473 */
474 if (error ||
475 (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
476 &gfree, &i)) ||
477 i == 0) {
478 end_of_ag = 1;
479 break;
480 }
481 /*
482 * If this chunk has any allocated inodes, save it.
483 */
484 if (gcnt < XFS_INODES_PER_CHUNK) {
485 INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
486 INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
487 INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
488 irbp++;
489 icount += XFS_INODES_PER_CHUNK - gcnt;
490 }
491 /*
492 * Set agino to after this chunk and bump the cursor.
493 */
494 agino = gino + XFS_INODES_PER_CHUNK;
495 error = xfs_inobt_increment(cur, 0, &tmp);
496 }
497 /*
498 * Drop the btree buffers and the agi buffer.
499 * We can't hold any of the locks these represent
500 * when calling iget.
501 */
502 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
503 xfs_buf_relse(agbp);
504 /*
505 * Now format all the good inodes into the user's buffer.
506 */
507 irbufend = irbp;
508 for (irbp = irbuf;
509 irbp < irbufend && ubleft >= statstruct_size; irbp++) {
510 /*
511 * Read-ahead the next chunk's worth of inodes.
512 */
513 if (&irbp[1] < irbufend) {
514 /*
515 * Loop over all clusters in the next chunk.
516 * Do a readahead if there are any allocated
517 * inodes in that cluster.
518 */
519 for (agbno = XFS_AGINO_TO_AGBNO(mp,
520 INT_GET(irbp[1].ir_startino, ARCH_CONVERT)),
521 chunkidx = 0;
522 chunkidx < XFS_INODES_PER_CHUNK;
523 chunkidx += nicluster,
524 agbno += nbcluster) {
525 if (XFS_INOBT_MASKN(chunkidx,
526 nicluster) &
527 ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT)))
528 xfs_btree_reada_bufs(mp, agno,
529 agbno, nbcluster);
530 }
531 }
532 /*
533 * Now process this chunk of inodes.
534 */
535 for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0;
536 ubleft > 0 &&
537 INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK;
538 chunkidx++, clustidx++, agino++) {
539 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
540 /*
541 * Recompute agbno if this is the
542 * first inode of the cluster.
543 *
544 * Careful with clustidx. There can be
545				 * multiple clusters per chunk, a single
546 * cluster per chunk or a cluster that has
547 * inodes represented from several different
548 * chunks (if blocksize is large).
549 *
550 * Because of this, the starting clustidx is
551 * initialized to zero in this loop but must
552 * later be reset after reading in the cluster
553 * buffer.
554 */
555 if ((chunkidx & (nicluster - 1)) == 0) {
556 agbno = XFS_AGINO_TO_AGBNO(mp,
557 INT_GET(irbp->ir_startino, ARCH_CONVERT)) +
558 ((chunkidx & nimask) >>
559 mp->m_sb.sb_inopblog);
560
561 if (flags & BULKSTAT_FG_QUICK) {
562 ino = XFS_AGINO_TO_INO(mp, agno,
563 agino);
564 bno = XFS_AGB_TO_DADDR(mp, agno,
565 agbno);
566
567 /*
568 * Get the inode cluster buffer
569 */
570 ASSERT(xfs_inode_zone != NULL);
571 ip = kmem_zone_zalloc(xfs_inode_zone,
572 KM_SLEEP);
573 ip->i_ino = ino;
574 ip->i_mount = mp;
575 if (bp)
576 xfs_buf_relse(bp);
577 error = xfs_itobp(mp, NULL, ip,
578 &dip, &bp, bno);
579 if (!error)
580 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
581 kmem_zone_free(xfs_inode_zone, ip);
582 if (XFS_TEST_ERROR(error != 0,
583 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
584 XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
585 bp = NULL;
586 break;
587 }
588 }
589 }
590 /*
591 * Skip if this inode is free.
592 */
593 if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT))
594 continue;
595 /*
596 * Count used inodes as free so we can tell
597 * when the chunk is used up.
598 */
599 INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1);
600 ino = XFS_AGINO_TO_INO(mp, agno, agino);
601 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
602 if (flags & BULKSTAT_FG_QUICK) {
603 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
604 (clustidx << mp->m_sb.sb_inodelog));
605
606 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT)
607 != XFS_DINODE_MAGIC
608 || !XFS_DINODE_GOOD_VERSION(
609 INT_GET(dip->di_core.di_version, ARCH_CONVERT)))
610 continue;
611 }
612
613 /*
614 * Get the inode and fill in a single buffer.
615 * BULKSTAT_FG_QUICK uses dip to fill it in.
616 * BULKSTAT_FG_IGET uses igets.
617 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
618 * This is also used to count inodes/blks, etc
619 * in xfs_qm_quotacheck.
620 */
621 ubused = statstruct_size;
622 error = formatter(mp, ino, ubufp,
623 ubleft, private_data,
624 bno, &ubused, dip, &fmterror);
625 if (fmterror == BULKSTAT_RV_NOTHING) {
626 if (error == ENOMEM)
627 ubleft = 0;
628 continue;
629 }
630 if (fmterror == BULKSTAT_RV_GIVEUP) {
631 ubleft = 0;
632 ASSERT(error);
633 rval = error;
634 break;
635 }
636 if (ubufp)
637 ubufp += ubused;
638 ubleft -= ubused;
639 ubelem++;
640 lastino = ino;
641 }
642 }
643
644 if (bp)
645 xfs_buf_relse(bp);
646
647 /*
648 * Set up for the next loop iteration.
649 */
650 if (ubleft > 0) {
651 if (end_of_ag) {
652 agno++;
653 agino = 0;
654 } else
655 agino = XFS_INO_TO_AGINO(mp, lastino);
656 } else
657 break;
658 }
659 /*
660	 * Done; we have either run out of filesystem or out of space for the data.
661 */
662 kmem_free(irbuf, NBPC);
663 if (ubuffer)
664 unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS));
665 *ubcountp = ubelem;
666 if (agno >= mp->m_sb.sb_agcount) {
667 /*
668 * If we ran out of filesystem, mark lastino as off
669 * the end of the filesystem, so the next call
670 * will return immediately.
671 */
672 *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
673 *done = 1;
674 } else
675 *lastinop = (xfs_ino_t)lastino;
676
677 return rval;
678}
679
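/*
 * Editorial sketch (not part of the original source): the resume
 * protocol for xfs_bulkstat() as seen from a caller - lastino and
 * done carry state between calls, and count is in/out. Modeled on
 * the call made in xfs_bulkstat_single() below; the function name is
 * hypothetical and error handling is minimal.
 */
static int
example_bulkstat_all(
	xfs_mount_t	*mp,
	char __user	*ubuffer,
	int		nentries)
{
	xfs_ino_t	lastino = 0;
	int		done = 0;
	int		count;
	int		error;

	while (!done) {
		count = nentries;
		error = xfs_bulkstat(mp, &lastino, &count,
				xfs_bulkstat_one, NULL,
				sizeof(xfs_bstat_t), ubuffer,
				BULKSTAT_FG_IGET, &done);
		if (error)
			return error;
		/* ... consume 'count' xfs_bstat_t records from ubuffer ... */
	}
	return 0;
}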
680/*
681 * Return stat information in bulk (by-inode) for the filesystem.
682 * Special case for a non-sequential, single-inode bulkstat.
683 */
684int /* error status */
685xfs_bulkstat_single(
686 xfs_mount_t *mp, /* mount point for filesystem */
687 xfs_ino_t *lastinop, /* inode to return */
688 char __user *buffer, /* buffer with inode stats */
689	int		*done)		/* set to 1 when there are no more stats */
690{
691 int count; /* count value for bulkstat call */
692 int error; /* return value */
693 xfs_ino_t ino; /* filesystem inode number */
694 int res; /* result from bs1 */
695
696 /*
697	 * Note that requesting valid inode numbers which are not allocated
698	 * to any inode will most likely cause xfs_itobp to generate warning
699	 * messages about bad magic numbers. This is OK. The fact that
700	 * the number doesn't refer to an actual inode is handled by the
701	 * error check below. It is done this way to make the usual case
702	 * faster at the expense of the error case.
703 */
704
705 ino = (xfs_ino_t)*lastinop;
706 error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
707 NULL, 0, NULL, NULL, &res);
708 if (error) {
709 /*
710		 * The special-case way failed; do it the "long" way
711 * to see if that works.
712 */
713 (*lastinop)--;
714 count = 1;
715 if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
716 NULL, sizeof(xfs_bstat_t), buffer,
717 BULKSTAT_FG_IGET, done))
718 return error;
719 if (count == 0 || (xfs_ino_t)*lastinop != ino)
720 return error == EFSCORRUPTED ?
721 XFS_ERROR(EINVAL) : error;
722 else
723 return 0;
724 }
725 *done = 0;
726 return 0;
727}
728
729/*
730 * Return inode number table for the filesystem.
731 */
732int /* error status */
733xfs_inumbers(
734 xfs_mount_t *mp, /* mount point for filesystem */
735 xfs_ino_t *lastino, /* last inode returned */
736 int *count, /* size of buffer/count returned */
737 xfs_inogrp_t __user *ubuffer)/* buffer with inode descriptions */
738{
739 xfs_buf_t *agbp;
740 xfs_agino_t agino;
741 xfs_agnumber_t agno;
742 int bcount;
743 xfs_inogrp_t *buffer;
744 int bufidx;
745 xfs_btree_cur_t *cur;
746 int error;
747 __int32_t gcnt;
748 xfs_inofree_t gfree;
749 xfs_agino_t gino;
750 int i;
751 xfs_ino_t ino;
752 int left;
753 int tmp;
754
755 ino = (xfs_ino_t)*lastino;
756 agno = XFS_INO_TO_AGNO(mp, ino);
757 agino = XFS_INO_TO_AGINO(mp, ino);
758 left = *count;
759 *count = 0;
760 bcount = MIN(left, (int)(NBPP / sizeof(*buffer)));
761 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
762 error = bufidx = 0;
763 cur = NULL;
764 agbp = NULL;
765 while (left > 0 && agno < mp->m_sb.sb_agcount) {
766 if (agbp == NULL) {
767 down_read(&mp->m_peraglock);
768 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
769 up_read(&mp->m_peraglock);
770 if (error) {
771 /*
772 * If we can't read the AGI of this ag,
773 * then just skip to the next one.
774 */
775 ASSERT(cur == NULL);
776 agbp = NULL;
777 agno++;
778 agino = 0;
779 continue;
780 }
781 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno,
782 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
783 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
784 if (error) {
785 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
786 cur = NULL;
787 xfs_buf_relse(agbp);
788 agbp = NULL;
789 /*
790				 * Move up to the last inode in the current
791 * chunk. The lookup_ge will always get
792 * us the first inode in the next chunk.
793 */
794 agino += XFS_INODES_PER_CHUNK - 1;
795 continue;
796 }
797 }
798 if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
799 &i)) ||
800 i == 0) {
801 xfs_buf_relse(agbp);
802 agbp = NULL;
803 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
804 cur = NULL;
805 agno++;
806 agino = 0;
807 continue;
808 }
809 agino = gino + XFS_INODES_PER_CHUNK - 1;
810 buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
811 buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
812 buffer[bufidx].xi_allocmask = ~gfree;
813 bufidx++;
814 left--;
815 if (bufidx == bcount) {
816 if (copy_to_user(ubuffer, buffer,
817 bufidx * sizeof(*buffer))) {
818 error = XFS_ERROR(EFAULT);
819 break;
820 }
821 ubuffer += bufidx;
822 *count += bufidx;
823 bufidx = 0;
824 }
825 if (left) {
826 error = xfs_inobt_increment(cur, 0, &tmp);
827 if (error) {
828 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
829 cur = NULL;
830 xfs_buf_relse(agbp);
831 agbp = NULL;
832 /*
833 * The agino value has already been bumped.
834 * Just try to skip up to it.
835 */
836 agino += XFS_INODES_PER_CHUNK;
837 continue;
838 }
839 }
840 }
841 if (!error) {
842 if (bufidx) {
843 if (copy_to_user(ubuffer, buffer,
844 bufidx * sizeof(*buffer)))
845 error = XFS_ERROR(EFAULT);
846 else
847 *count += bufidx;
848 }
849 *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
850 }
851 kmem_free(buffer, bcount * sizeof(*buffer));
852 if (cur)
853 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
854 XFS_BTREE_NOERROR));
855 if (agbp)
856 xfs_buf_relse(agbp);
857 return error;
858}
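/*
 * Editorial sketch (not part of the original source): how a caller
 * walks every inode chunk in the filesystem with xfs_inumbers().
 * 'count' is the buffer capacity on entry and the number of
 * xfs_inogrp_t records returned on exit; lastino carries the resume
 * point between calls. The function name is hypothetical.
 */
static int
example_walk_inode_chunks(
	xfs_mount_t		*mp,
	xfs_inogrp_t __user	*ubuffer,
	int			ubcount)
{
	xfs_ino_t	lastino = 0;
	int		count;
	int		error;

	do {
		count = ubcount;
		error = xfs_inumbers(mp, &lastino, &count, ubuffer);
		if (error)
			return error;
		/* ... consume 'count' records from ubuffer here ... */
	} while (count > 0);
	return 0;
}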
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
new file mode 100644
index 000000000000..2be9d1805ab2
--- /dev/null
+++ b/fs/xfs/xfs_itable.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ITABLE_H__
33#define __XFS_ITABLE_H__
34
35/*
36 * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
37 * structures (by the dmi library). This is a pointer to a formatter function
38 * that will iget the inode and fill in the appropriate structure.
39 * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c
40 * See xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c.
41typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
42 xfs_ino_t ino,
43 void __user *buffer,
44 int ubsize,
45 void *private_data,
46 xfs_daddr_t bno,
47 int *ubused,
48 void *dip,
49 int *stat);
50
51/*
52 * Values for stat return value.
53 */
54#define BULKSTAT_RV_NOTHING 0
55#define BULKSTAT_RV_DIDONE 1
56#define BULKSTAT_RV_GIVEUP 2
57
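/*
 * Editorial sketch (not part of the original source): the minimal
 * shape of a bulkstat_one_pf formatter, showing only the calling
 * contract - check the caller's buffer space, report the bytes
 * consumed through *ubused, and report the outcome through *stat
 * (see xfs_bulkstat_one() for the real thing). The function itself
 * is entirely hypothetical.
 */
static int
example_bulkstat_formatter(
	struct xfs_mount	*mp,
	xfs_ino_t		ino,
	void __user		*buffer,
	int			ubsize,
	void			*private_data,
	xfs_daddr_t		bno,
	int			*ubused,
	void			*dip,
	int			*stat)
{
	/* refuse if the caller's buffer can't hold one record */
	if (ubsize < (int)sizeof(xfs_bstat_t)) {
		*stat = BULKSTAT_RV_NOTHING;
		return ENOMEM;
	}
	/* ... fill in and copy_to_user() an xfs_bstat_t here ... */
	if (ubused)
		*ubused = sizeof(xfs_bstat_t);
	*stat = BULKSTAT_RV_DIDONE;
	return 0;
}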
58/*
59 * Values for bulkstat flag argument.
60 */
61#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */
62#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */
63#define BULKSTAT_FG_VFSLOCKED 0x4 /* Already have vfs lock */
64
65/*
66 * Return stat information in bulk (by-inode) for the filesystem.
67 */
68int /* error status */
69xfs_bulkstat(
70 xfs_mount_t *mp, /* mount point for filesystem */
71 xfs_ino_t *lastino, /* last inode returned */
72 int *count, /* size of buffer/count returned */
73 bulkstat_one_pf formatter, /* func that'd fill a single buf */
74 void *private_data, /* private data for formatter */
75 size_t statstruct_size,/* sizeof struct that we're filling */
76 char __user *ubuffer,/* buffer with inode stats */
77 int flags, /* flag to control access method */
78	int		*done);		/* set to 1 when there are no more stats */
79
80int
81xfs_bulkstat_single(
82 xfs_mount_t *mp,
83 xfs_ino_t *lastinop,
84 char __user *buffer,
85 int *done);
86
87int
88xfs_bulkstat_one(
89 xfs_mount_t *mp,
90 xfs_ino_t ino,
91 void __user *buffer,
92 int ubsize,
93 void *private_data,
94 xfs_daddr_t bno,
95 int *ubused,
96 void *dibuff,
97 int *stat);
98
99int /* error status */
100xfs_inumbers(
101 xfs_mount_t *mp, /* mount point for filesystem */
102 xfs_ino_t *last, /* last inode returned */
103 int *count, /* size of buffer/count returned */
104 xfs_inogrp_t __user *buffer);/* buffer with inode info */
105
106#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
new file mode 100644
index 000000000000..092d5fb096b1
--- /dev/null
+++ b/fs/xfs/xfs_log.c
@@ -0,0 +1,3560 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * High level interface routines for log manager
35 */
36
37#include "xfs.h"
38#include "xfs_macros.h"
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_ag.h"
42#include "xfs_sb.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_dir.h"
46#include "xfs_dmapi.h"
47#include "xfs_mount.h"
48#include "xfs_error.h"
49#include "xfs_log_priv.h"
50#include "xfs_buf_item.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_log_recover.h"
53#include "xfs_bit.h"
54#include "xfs_rw.h"
55#include "xfs_trans_priv.h"
56
57
58#define xlog_write_adv_cnt(ptr, len, off, bytes) \
59 { (ptr) += (bytes); \
60 (len) -= (bytes); \
61 (off) += (bytes);}
62
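/*
 * Editorial note (not part of the original source): a writer that has
 * just copied copy_len bytes into an iclog advances all three of its
 * cursors in one step, e.g.
 *
 *	xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
 *
 * which expands to ptr += copy_len; len -= copy_len;
 * log_offset += copy_len.
 */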
63/* Local miscellaneous function prototypes */
64STATIC int xlog_bdstrat_cb(struct xfs_buf *);
65STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
66 xlog_in_core_t **, xfs_lsn_t *);
67STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
68 xfs_buftarg_t *log_target,
69 xfs_daddr_t blk_offset,
70 int num_bblks);
71STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
72STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
73STATIC void xlog_unalloc_log(xlog_t *log);
74STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
75 int nentries, xfs_log_ticket_t tic,
76 xfs_lsn_t *start_lsn,
77 xlog_in_core_t **commit_iclog,
78 uint flags);
79
80/* local state machine functions */
81STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
82STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog);
83STATIC int xlog_state_get_iclog_space(xlog_t *log,
84 int len,
85 xlog_in_core_t **iclog,
86 xlog_ticket_t *ticket,
87 int *continued_write,
88 int *logoffsetp);
89STATIC void xlog_state_put_ticket(xlog_t *log,
90 xlog_ticket_t *tic);
91STATIC int xlog_state_release_iclog(xlog_t *log,
92 xlog_in_core_t *iclog);
93STATIC void xlog_state_switch_iclogs(xlog_t *log,
94 xlog_in_core_t *iclog,
95 int eventual_size);
96STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
97STATIC int xlog_state_sync_all(xlog_t *log, uint flags);
98STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
99
100/* local functions to manipulate grant head */
101STATIC int xlog_grant_log_space(xlog_t *log,
102 xlog_ticket_t *xtic);
103STATIC void xlog_grant_push_ail(xfs_mount_t *mp,
104 int need_bytes);
105STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
106 xlog_ticket_t *ticket);
107STATIC int xlog_regrant_write_log_space(xlog_t *log,
108 xlog_ticket_t *ticket);
109STATIC void xlog_ungrant_log_space(xlog_t *log,
110 xlog_ticket_t *ticket);
111
112
113/* local ticket functions */
114STATIC void xlog_state_ticket_alloc(xlog_t *log);
115STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log,
116 int unit_bytes,
117 int count,
118 char clientid,
119 uint flags);
120STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
121
122/* local debug functions */
123#if defined(DEBUG) && !defined(XLOG_NOLOG)
124STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
125STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
126STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
127 int count, boolean_t syncing);
128STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
129 xfs_lsn_t tail_lsn);
130#else
131#define xlog_verify_dest_ptr(a,b)
132#define xlog_verify_grant_head(a,b)
133#define xlog_verify_iclog(a,b,c,d)
134#define xlog_verify_tail_lsn(a,b,c)
135#endif
136
137int xlog_iclogs_empty(xlog_t *log);
138
139#ifdef DEBUG
140int xlog_do_error = 0;
141int xlog_req_num = 0;
142int xlog_error_mod = 33;
143#endif
144
145#define XLOG_FORCED_SHUTDOWN(log) (log->l_flags & XLOG_IO_ERROR)
146
147/*
148 * 0 => disable log manager
149 * 1 => enable log manager
150 * 2 => enable log manager and log debugging
151 */
152#if defined(XLOG_NOLOG) || defined(DEBUG)
153int xlog_debug = 1;
154xfs_buftarg_t *xlog_target;
155#endif
156
157#if defined(XFS_LOG_TRACE)
158
159void
160xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
161{
162 if (! log->l_grant_trace) {
163 log->l_grant_trace = ktrace_alloc(1024, KM_NOSLEEP);
164 if (! log->l_grant_trace)
165 return;
166 }
167
168 ktrace_enter(log->l_grant_trace,
169 (void *)tic,
170 (void *)log->l_reserve_headq,
171 (void *)log->l_write_headq,
172 (void *)((unsigned long)log->l_grant_reserve_cycle),
173 (void *)((unsigned long)log->l_grant_reserve_bytes),
174 (void *)((unsigned long)log->l_grant_write_cycle),
175 (void *)((unsigned long)log->l_grant_write_bytes),
176 (void *)((unsigned long)log->l_curr_cycle),
177 (void *)((unsigned long)log->l_curr_block),
178 (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
179 (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
180 (void *)string,
181 (void *)((unsigned long)13),
182 (void *)((unsigned long)14),
183 (void *)((unsigned long)15),
184 (void *)((unsigned long)16));
185}
186
187void
188xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
189{
190 pid_t pid;
191
192 pid = current_pid();
193
194 if (!iclog->ic_trace)
195 iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
196 ktrace_enter(iclog->ic_trace,
197 (void *)((unsigned long)state),
198 (void *)((unsigned long)pid),
199 (void *)0,
200 (void *)0,
201 (void *)0,
202 (void *)0,
203 (void *)0,
204 (void *)0,
205 (void *)0,
206 (void *)0,
207 (void *)0,
208 (void *)0,
209 (void *)0,
210 (void *)0,
211 (void *)0,
212 (void *)0);
213}
214
215#else
216#define xlog_trace_loggrant(log,tic,string)
217#define xlog_trace_iclog(iclog,state)
218#endif /* XFS_LOG_TRACE */
219
220/*
221 * NOTES:
222 *
223 * 1. currblock field gets updated at startup and after in-core logs
224 *	are marked with WANT_SYNC.
225 */
226
227/*
228 * This routine is called when a user of a log manager ticket is done with
229 * the reservation. If the ticket was ever used, then a commit record for
230 * the associated transaction is written out as a log operation header with
231 * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
232 * a given ticket. If the ticket was one with a permanent reservation, then
233 * a few operations are done differently. Permanent reservation tickets by
234 * default don't release the reservation. They just commit the current
235 * transaction with the belief that the reservation is still needed. A flag
236 * must be passed in before permanent reservations are actually released.
237 * When these types of tickets are not released, they need to be set into
238 * the inited state again. By doing this, a start record will be written
239 * out when the next write occurs.
240 */
241xfs_lsn_t
242xfs_log_done(xfs_mount_t *mp,
243 xfs_log_ticket_t xtic,
244 void **iclog,
245 uint flags)
246{
247 xlog_t *log = mp->m_log;
248	xlog_ticket_t	*ticket = (xlog_ticket_t *)xtic;
249 xfs_lsn_t lsn = 0;
250
251#if defined(DEBUG) || defined(XLOG_NOLOG)
252 if (!xlog_debug && xlog_target == log->l_targ)
253 return 0;
254#endif
255
256 if (XLOG_FORCED_SHUTDOWN(log) ||
257 /*
258	     * If nothing was ever written, don't write out a commit record.
259 * If we get an error, just continue and give back the log ticket.
260 */
261 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
262 (xlog_commit_record(mp, ticket,
263 (xlog_in_core_t **)iclog, &lsn)))) {
264 lsn = (xfs_lsn_t) -1;
265 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
266 flags |= XFS_LOG_REL_PERM_RESERV;
267 }
268 }
269
270
271 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
272 (flags & XFS_LOG_REL_PERM_RESERV)) {
273 /*
274		 * Release the ticket if not a permanent reservation or a specific
275 * request has been made to release a permanent reservation.
276 */
277 xlog_ungrant_log_space(log, ticket);
278 xlog_state_put_ticket(log, ticket);
279 } else {
280 xlog_regrant_reserve_log_space(log, ticket);
281 }
282
283 /* If this ticket was a permanent reservation and we aren't
284	 * trying to release it, reset the inited flags so that next time
285 * we write, a start record will be written out.
286 */
287 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
288 (flags & XFS_LOG_REL_PERM_RESERV) == 0)
289 ticket->t_flags |= XLOG_TIC_INITED;
290
291 return lsn;
292} /* xfs_log_done */
293
294
295/*
296 * Force the in-core log to disk. If flags == XFS_LOG_SYNC,
297 * the force is done synchronously.
298 *
299 * Asynchronous forces are implemented by setting the WANT_SYNC
300 * bit in the appropriate in-core log and then returning.
301 *
302 * Synchronous forces are implemented with a semaphore. All callers
303 * to force a given lsn to disk will wait on a semaphore attached to the
304 * specific in-core log. When the given in-core log finally completes its
305 * write to disk, that thread will wake up all threads waiting on the
306 * semaphore.
307 */
308int
309xfs_log_force(xfs_mount_t *mp,
310 xfs_lsn_t lsn,
311 uint flags)
312{
313 int rval;
314 xlog_t *log = mp->m_log;
315
316#if defined(DEBUG) || defined(XLOG_NOLOG)
317 if (!xlog_debug && xlog_target == log->l_targ)
318 return 0;
319#endif
320
321 ASSERT(flags & XFS_LOG_FORCE);
322
323 XFS_STATS_INC(xs_log_force);
324
325 if ((log->l_flags & XLOG_IO_ERROR) == 0) {
326 if (lsn == 0)
327 rval = xlog_state_sync_all(log, flags);
328 else
329 rval = xlog_state_sync(log, lsn, flags);
330 } else {
331 rval = XFS_ERROR(EIO);
332 }
333
334 return rval;
335
336} /* xfs_log_force */
337
338/*
339 * Attaches a new iclog I/O completion callback routine during
340 * transaction commit. If the log is in error state, a non-zero
341 * return code is handed back and the caller is responsible for
342 * executing the callback at an appropriate time.
343 */
344int
345xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
346 void *iclog_hndl, /* iclog to hang callback off */
347 xfs_log_callback_t *cb)
348{
349 xlog_t *log = mp->m_log;
350 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
351 int abortflg, spl;
352
353#if defined(DEBUG) || defined(XLOG_NOLOG)
354 if (!xlog_debug && xlog_target == log->l_targ)
355 return 0;
356#endif
357 cb->cb_next = NULL;
358 spl = LOG_LOCK(log);
359 abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
360 if (!abortflg) {
361 ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
362 (iclog->ic_state == XLOG_STATE_WANT_SYNC));
363 cb->cb_next = NULL;
364 *(iclog->ic_callback_tail) = cb;
365 iclog->ic_callback_tail = &(cb->cb_next);
366 }
367 LOG_UNLOCK(log, spl);
368 return abortflg;
369} /* xfs_log_notify */
370
371int
372xfs_log_release_iclog(xfs_mount_t *mp,
373 void *iclog_hndl)
374{
375 xlog_t *log = mp->m_log;
376 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
377
378 if (xlog_state_release_iclog(log, iclog)) {
379 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
380 return(EIO);
381 }
382
383 return 0;
384}
385
386/*
387 * 1. Reserve an amount of on-disk log space and return a ticket corresponding
388 * to the reservation.
389 * 2. Potentially, push buffers at tail of log to disk.
390 *
391 * Each reservation is going to reserve extra space for a log record header.
392 * When writes happen to the on-disk log, we don't subtract the length of the
393 * log record header from any reservation. By wasting space in each
394 * reservation, we prevent overallocation problems.
395 */
396int
397xfs_log_reserve(xfs_mount_t *mp,
398 int unit_bytes,
399 int cnt,
400 xfs_log_ticket_t *ticket,
401 __uint8_t client,
402 uint flags)
403{
404 xlog_t *log = mp->m_log;
405 xlog_ticket_t *internal_ticket;
406 int retval;
407
408#if defined(DEBUG) || defined(XLOG_NOLOG)
409 if (!xlog_debug && xlog_target == log->l_targ)
410 return 0;
411#endif
412 retval = 0;
413 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
414 ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
415
416 if (XLOG_FORCED_SHUTDOWN(log))
417 return XFS_ERROR(EIO);
418
419 XFS_STATS_INC(xs_try_logspace);
420
421 if (*ticket != NULL) {
422 ASSERT(flags & XFS_LOG_PERM_RESERV);
423 internal_ticket = (xlog_ticket_t *)*ticket;
424 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
425 retval = xlog_regrant_write_log_space(log, internal_ticket);
426 } else {
427 /* may sleep if need to allocate more tickets */
428 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
429 client, flags);
430 *ticket = internal_ticket;
431 xlog_grant_push_ail(mp,
432 (internal_ticket->t_unit_res *
433 internal_ticket->t_cnt));
434 retval = xlog_grant_log_space(log, internal_ticket);
435 }
436
437 return retval;
438} /* xfs_log_reserve */
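
/*
 * Illustrative call sequence for a permanent reservation (a sketch with
 * placeholder values such as unit_bytes, not code from this file): the
 * first call allocates the ticket, and later calls passing the same
 * ticket take the regrant path above.
 *
 *	xfs_log_ticket_t tic = NULL;
 *
 *	error = xfs_log_reserve(mp, unit_bytes, 2, &tic,
 *				XFS_TRANSACTION, XFS_LOG_PERM_RESERV);
 *	...
 *	error = xfs_log_reserve(mp, unit_bytes, 2, &tic,
 *				XFS_TRANSACTION, XFS_LOG_PERM_RESERV);
 */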
439
440
441/*
442 * Mount the log portion of a filesystem.
443 *
444 * mp - ubiquitous xfs mount point structure
445 * log_target - buftarg of on-disk log device
446 * blk_offset - Start block # of the log, in 512-byte basic blocks (BBSIZE)
447 * num_bblks - Number of BBSIZE blocks in on-disk log
448 *
449 * Return error or zero.
450 */
451int
452xfs_log_mount(xfs_mount_t *mp,
453 xfs_buftarg_t *log_target,
454 xfs_daddr_t blk_offset,
455 int num_bblks)
456{
457 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
458 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
459 else {
460 cmn_err(CE_NOTE,
461 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
462 mp->m_fsname);
463 ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
464 }
465
466 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
467
468#if defined(DEBUG) || defined(XLOG_NOLOG)
469 if (!xlog_debug) {
470 cmn_err(CE_NOTE, "log dev: %s", XFS_BUFTARG_NAME(log_target));
471 return 0;
472 }
473#endif
474 /*
475 * Skip log recovery on a norecovery mount. Pretend it all
476 * just worked.
477 */
478 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
479 int error;
480 vfs_t *vfsp = XFS_MTOVFS(mp);
481 int readonly = (vfsp->vfs_flag & VFS_RDONLY);
482
483 if (readonly)
484 vfsp->vfs_flag &= ~VFS_RDONLY;
485
486 error = xlog_recover(mp->m_log, readonly);
487
488 if (readonly)
489 vfsp->vfs_flag |= VFS_RDONLY;
490 if (error) {
491 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
492 xlog_unalloc_log(mp->m_log);
493 return error;
494 }
495 }
496
497 /* Normal transactions can now occur */
498 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
499
500 /* End mounting message in xfs_log_mount_finish */
501 return 0;
502} /* xfs_log_mount */
503
504/*
505 * Finish the recovery of the file system. This is separate from
506 * the xfs_log_mount() call, because it depends on the code in
507 * xfs_mountfs() to read in the root and real-time bitmap inodes
508 * between calling xfs_log_mount() and here.
509 *
510 * mp - ubiquitous xfs mount point structure
511 */
512int
513xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
514{
515 int error;
516
517 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
518 error = xlog_recover_finish(mp->m_log, mfsi_flags);
519 else {
520 error = 0;
521 ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
522 }
523
524 return error;
525}
526
527/*
528 * Unmount processing for the log.
529 */
530int
531xfs_log_unmount(xfs_mount_t *mp)
532{
533 int error;
534
535 error = xfs_log_unmount_write(mp);
536 xfs_log_unmount_dealloc(mp);
537 return (error);
538}
539
540/*
541 * Final log writes as part of unmount.
542 *
543 * Mark the filesystem clean as unmount happens. Note that during relocation
544 * this routine needs to be executed as part of source-bag while the
545 * deallocation must not be done until source-end.
546 */
547
548/*
549 * The unmount record used to have a string "Unmount filesystem--" in the
550 * data section, where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
551 * We just write the magic number now, since that field isn't currently
552 * architecture converted and would read back as "nUmount" on opposite-endian
553 * hosts. As far as I know, there weren't any dependencies on the old behaviour.
554 */
555
556int
557xfs_log_unmount_write(xfs_mount_t *mp)
558{
559 xlog_t *log = mp->m_log;
560 xlog_in_core_t *iclog;
561#ifdef DEBUG
562 xlog_in_core_t *first_iclog;
563#endif
564 xfs_log_iovec_t reg[1];
565 xfs_log_ticket_t tic = NULL;
566 xfs_lsn_t lsn;
567 int error;
568 SPLDECL(s);
569
570 /* the data section must be 32 bit size aligned */
571 struct {
572 __uint16_t magic;
573 __uint16_t pad1;
574 __uint32_t pad2; /* may as well make it 64 bits */
575 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
576
577#if defined(DEBUG) || defined(XLOG_NOLOG)
578 if (!xlog_debug && xlog_target == log->l_targ)
579 return 0;
580#endif
581
582 /*
583 * Don't write out an unmount record on read-only mounts, or if we
584 * are doing a forced unmount (typically because of I/O errors).
585 */
586 if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
587 return 0;
588
589 xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
590
591#ifdef DEBUG
592 first_iclog = iclog = log->l_iclog;
593 do {
594 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
595 ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
596 ASSERT(iclog->ic_offset == 0);
597 }
598 iclog = iclog->ic_next;
599 } while (iclog != first_iclog);
600#endif
601 if (! (XLOG_FORCED_SHUTDOWN(log))) {
602 reg[0].i_addr = (void*)&magic;
603 reg[0].i_len = sizeof(magic);
604
605 error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
606 if (!error) {
607 /* remove inited flag */
608 ((xlog_ticket_t *)tic)->t_flags = 0;
609 error = xlog_write(mp, reg, 1, tic, &lsn,
610 NULL, XLOG_UNMOUNT_TRANS);
611 /*
612 * At this point, we're umounting anyway,
613 * so there's no point in transitioning log state
614 * to IOERROR. Just continue...
615 */
616 }
617
618 if (error) {
619 xfs_fs_cmn_err(CE_ALERT, mp,
620 "xfs_log_unmount: unmount record failed");
621 }
622
623
624 s = LOG_LOCK(log);
625 iclog = log->l_iclog;
626 iclog->ic_refcnt++;
627 LOG_UNLOCK(log, s);
628 xlog_state_want_sync(log, iclog);
629 (void) xlog_state_release_iclog(log, iclog);
630
631 s = LOG_LOCK(log);
632 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
633 iclog->ic_state == XLOG_STATE_DIRTY)) {
634 if (!XLOG_FORCED_SHUTDOWN(log)) {
635 sv_wait(&iclog->ic_forcesema, PMEM,
636 &log->l_icloglock, s);
637 } else {
638 LOG_UNLOCK(log, s);
639 }
640 } else {
641 LOG_UNLOCK(log, s);
642 }
643 if (tic)
644 xlog_state_put_ticket(log, tic);
645 } else {
646 /*
647 * We're already in forced_shutdown mode, so we couldn't
648 * even attempt to write out the unmount transaction.
649 *
650 * Go through the motions of sync'ing and releasing
651 * the iclog, even though no I/O will actually happen:
652 * we need to wait for other log I/Os that may already
653 * be in progress. Do this as a separate section of
654 * code so that if we ever get stuck here, we'll know
655 * we're in this odd situation of trying to unmount
656 * a filesystem that went into forced_shutdown as
657 * the result of an unmount.
658 */
659 s = LOG_LOCK(log);
660 iclog = log->l_iclog;
661 iclog->ic_refcnt++;
662 LOG_UNLOCK(log, s);
663
664 xlog_state_want_sync(log, iclog);
665 (void) xlog_state_release_iclog(log, iclog);
666
667 s = LOG_LOCK(log);
668
669 if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
670 || iclog->ic_state == XLOG_STATE_DIRTY
671 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
672
673 sv_wait(&iclog->ic_forcesema, PMEM,
674 &log->l_icloglock, s);
675 } else {
676 LOG_UNLOCK(log, s);
677 }
678 }
679
680 return 0;
681} /* xfs_log_unmount_write */
682
683/*
684 * Deallocate log structures for unmount/relocation.
685 */
686void
687xfs_log_unmount_dealloc(xfs_mount_t *mp)
688{
689 xlog_unalloc_log(mp->m_log);
690}
691
692/*
693 * Write region vectors to log. The write happens using the space reservation
694 * of the ticket (tic). It is not a requirement that all writes for a given
695 * transaction occur with one call to xfs_log_write().
696 */
697int
698xfs_log_write(xfs_mount_t * mp,
699 xfs_log_iovec_t reg[],
700 int nentries,
701 xfs_log_ticket_t tic,
702 xfs_lsn_t *start_lsn)
703{
704 int error;
705 xlog_t *log = mp->m_log;
706
707#if defined(DEBUG) || defined(XLOG_NOLOG)
708 if (!xlog_debug && xlog_target == log->l_targ) {
709 *start_lsn = 0;
710 return 0;
711 }
712#endif
713 if (XLOG_FORCED_SHUTDOWN(log))
714 return XFS_ERROR(EIO);
715
716 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
717 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
718 }
719 return (error);
720} /* xfs_log_write */
721
722
723void
724xfs_log_move_tail(xfs_mount_t *mp,
725 xfs_lsn_t tail_lsn)
726{
727 xlog_ticket_t *tic;
728 xlog_t *log = mp->m_log;
729 int need_bytes, free_bytes, cycle, bytes;
730 SPLDECL(s);
731
732#if defined(DEBUG) || defined(XLOG_NOLOG)
733 if (!xlog_debug && xlog_target == log->l_targ)
734 return;
735#endif
736 /* XXXsup tmp */
737 if (XLOG_FORCED_SHUTDOWN(log))
738 return;
739 ASSERT(!XFS_FORCED_SHUTDOWN(mp));
740
741 if (tail_lsn == 0) {
742 /* needed since sync_lsn is 64 bits */
743 s = LOG_LOCK(log);
744 tail_lsn = log->l_last_sync_lsn;
745 LOG_UNLOCK(log, s);
746 }
747
748 s = GRANT_LOCK(log);
749
750 /* A tail_lsn of 1 is also an invalid lsn; it implies that the caller
751 * isn't passing in a valid tail_lsn.
752 */
753 if (tail_lsn != 1) {
754 log->l_tail_lsn = tail_lsn;
755 }
756
757 if ((tic = log->l_write_headq)) {
758#ifdef DEBUG
759 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
760 panic("Recovery problem");
761#endif
762 cycle = log->l_grant_write_cycle;
763 bytes = log->l_grant_write_bytes;
764 free_bytes = xlog_space_left(log, cycle, bytes);
765 do {
766 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
767
768 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
769 break;
770 tail_lsn = 0;
771 free_bytes -= tic->t_unit_res;
772 sv_signal(&tic->t_sema);
773 tic = tic->t_next;
774 } while (tic != log->l_write_headq);
775 }
776 if ((tic = log->l_reserve_headq)) {
777#ifdef DEBUG
778 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
779 panic("Recovery problem");
780#endif
781 cycle = log->l_grant_reserve_cycle;
782 bytes = log->l_grant_reserve_bytes;
783 free_bytes = xlog_space_left(log, cycle, bytes);
784 do {
785 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
786 need_bytes = tic->t_unit_res*tic->t_cnt;
787 else
788 need_bytes = tic->t_unit_res;
789 if (free_bytes < need_bytes && tail_lsn != 1)
790 break;
791 tail_lsn = 0;
792 free_bytes -= need_bytes;
793 sv_signal(&tic->t_sema);
794 tic = tic->t_next;
795 } while (tic != log->l_reserve_headq);
796 }
797 GRANT_UNLOCK(log, s);
798} /* xfs_log_move_tail */
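
/*
 * Worked example of the wakeup loops above (hypothetical numbers): with
 * 160KB of free log space and three queued write tickets of t_unit_res =
 * 64KB each, the first two tickets are signalled (leaving 96KB and then
 * 32KB of headroom) and the walk stops at the third, since 32KB < 64KB.
 */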
799
800/*
801 * Determine if we have a transaction that has gone to disk
802 * and needs to be covered. Log activity needs to be idle (no AIL and
803 * nothing in the iclogs), and we need to be in the right state,
804 * indicating that something has gone out.
805 */
806int
807xfs_log_need_covered(xfs_mount_t *mp)
808{
809 SPLDECL(s);
810 int needed = 0, gen;
811 xlog_t *log = mp->m_log;
812 vfs_t *vfsp = XFS_MTOVFS(mp);
813
814 if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
815 (vfsp->vfs_flag & VFS_RDONLY))
816 return 0;
817
818 s = LOG_LOCK(log);
819 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
820 (log->l_covered_state == XLOG_STATE_COVER_NEED2))
821 && !xfs_trans_first_ail(mp, &gen)
822 && xlog_iclogs_empty(log)) {
823 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
824 log->l_covered_state = XLOG_STATE_COVER_DONE;
825 else {
826 ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
827 log->l_covered_state = XLOG_STATE_COVER_DONE2;
828 }
829 needed = 1;
830 }
831 LOG_UNLOCK(log, s);
832 return(needed);
833}
834
835/******************************************************************************
836 *
837 * local routines
838 *
839 ******************************************************************************
840 */
841
842/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
843 * The log manager must keep track of the last LR which was committed
844 * to disk. The lsn of this LR will become the new tail_lsn whenever
845 * xfs_trans_tail_ail returns 0. If we don't do this, we run into
846 * the situation where stuff could be written into the log but nothing
847 * was ever in the AIL when asked. Eventually, we panic since the
848 * tail hits the head.
849 *
850 * We may be holding the log iclog lock upon entering this routine.
851 */
852xfs_lsn_t
853xlog_assign_tail_lsn(xfs_mount_t *mp)
854{
855 xfs_lsn_t tail_lsn;
856 SPLDECL(s);
857 xlog_t *log = mp->m_log;
858
859 tail_lsn = xfs_trans_tail_ail(mp);
860 s = GRANT_LOCK(log);
861 if (tail_lsn != 0) {
862 log->l_tail_lsn = tail_lsn;
863 } else {
864 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
865 }
866 GRANT_UNLOCK(log, s);
867
868 return tail_lsn;
869} /* xlog_assign_tail_lsn */
870
871
872/*
873 * Return the space in the log between the tail and the head. The head
874 * is passed in the cycle/bytes formal parms. In the special case where
875 * the reserve head has wrapped past the tail, this calculation is no
876 * longer valid. In this case, just return 0 which means there is no space
877 * in the log. This works for all places where this function is called
878 * with the reserve head. Of course, if the write head were to ever
879 * wrap the tail, we should blow up. Rather than catch this case here,
880 * we depend on other ASSERTions in other parts of the code. XXXmiken
881 *
882 * This code also handles the case where the reservation head is behind
883 * the tail. The details of this case are described below, but the end
884 * result is that we return the size of the log as the amount of space left.
885 */
886int
887xlog_space_left(xlog_t *log, int cycle, int bytes)
888{
889 int free_bytes;
890 int tail_bytes;
891 int tail_cycle;
892
893 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
894 tail_cycle = CYCLE_LSN(log->l_tail_lsn);
895 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
896 free_bytes = log->l_logsize - (bytes - tail_bytes);
897 } else if ((tail_cycle + 1) < cycle) {
898 return 0;
899 } else if (tail_cycle < cycle) {
900 ASSERT(tail_cycle == (cycle - 1));
901 free_bytes = tail_bytes - bytes;
902 } else {
903 /*
904 * The reservation head is behind the tail.
905 * In this case we just want to return the size of the
906 * log as the amount of space left.
907 */
908 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
909 "xlog_space_left: head behind tail\n"
910 " tail_cycle = %d, tail_bytes = %d\n"
911 " GH cycle = %d, GH bytes = %d",
912 tail_cycle, tail_bytes, cycle, bytes);
913 ASSERT(0);
914 free_bytes = log->l_logsize;
915 }
916 return free_bytes;
917} /* xlog_space_left */
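
/*
 * Worked example (hypothetical numbers): for a 16MB log with the tail at
 * cycle 5, byte offset 1MB and the head at cycle 5, byte offset 9MB, the
 * first case above applies and free_bytes = 16MB - (9MB - 1MB) = 8MB.
 * If the head has instead wrapped to cycle 6, byte offset 512KB, the
 * tail_cycle == cycle - 1 case applies and free_bytes = 1MB - 512KB = 512KB.
 */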
918
919
920/*
921 * Log function which is called when an I/O completes.
922 *
923 * The log manager needs its own routine, in order to control what
924 * happens with the buffer after the write completes.
925 */
926void
927xlog_iodone(xfs_buf_t *bp)
928{
929 xlog_in_core_t *iclog;
930 xlog_t *l;
931 int aborted;
932
933 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
934 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
935 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
936 aborted = 0;
937
938 /*
939 * Some versions of cpp barf on the recursive definition of
940 * ic_log -> hic_fields.ic_log and expand ic_log twice when
941 * it is passed through two macros. Work around broken cpp.
942 */
943 l = iclog->ic_log;
944
945 /*
946 * Race to shut down the filesystem if we see an error.
947 */
948 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
949 XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
950 xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
951 XFS_BUF_STALE(bp);
952 xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
953 /*
954 * This flag will be propagated to the trans-committed
955 * callback routines to let them know that the log-commit
956 * didn't succeed.
957 */
958 aborted = XFS_LI_ABORTED;
959 } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
960 aborted = XFS_LI_ABORTED;
961 }
962 xlog_state_done_syncing(iclog, aborted);
963 if (!(XFS_BUF_ISASYNC(bp))) {
964 /*
965 * Corresponding psema() will be done in bwrite(). If we don't
966 * vsema() here, panic.
967 */
968 XFS_BUF_V_IODONESEMA(bp);
969 }
970} /* xlog_iodone */
971
972/*
973 * The bdstrat callback function for log bufs. This gives us a central
974 * place to trap bufs in case we get hit by a log I/O error and need to
975 * shutdown. Actually, in practice, even when we didn't get a log error,
976 * we transition the iclogs to IOERROR state *after* flushing all existing
977 * iclogs to disk. This is because we don't want any more transactions to be
978 * started or completed afterwards.
979 */
980STATIC int
981xlog_bdstrat_cb(struct xfs_buf *bp)
982{
983 xlog_in_core_t *iclog;
984
985 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
986
987 if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
988 /* Note: for IRIX, bstrat will need a struct bdevsw passed in.
989 * Fix the following macro if the code is ever merged.
990 */
991 XFS_bdstrat(bp);
992 return 0;
993 }
994
995 xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
996 XFS_BUF_ERROR(bp, EIO);
997 XFS_BUF_STALE(bp);
998 xfs_biodone(bp);
999 return (XFS_ERROR(EIO));
1000
1001
1002}
1003
1004/*
1005 * Return size of each in-core log record buffer.
1006 *
1007 * Low-memory machines only get 2 16KB buffers; we don't want to waste
1008 * memory there. All other machines get at least 2 32KB buffers.
1009 * The threshold is hard coded because we only care about distinguishing
1010 * 32MB systems, not about the exact minimum memory size.
1011 *
1012 * If the filesystem blocksize is too large, we may need to choose a
1013 * larger size since the directory code currently logs entire blocks.
1014 */
1015
1016STATIC void
1017xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1018 xlog_t *log)
1019{
1020 int size;
1021 int xhdrs;
1022
1023#if defined(DEBUG) || defined(XLOG_NOLOG)
1024 /*
1025 * When logbufs == 0, someone has disabled the log from the FSTAB
1026 * file. This is not a documented feature. We need to set xlog_debug
1027 * to zero (this deactivates the log) and set xlog_target to the
1028 * appropriate device. Only one filesystem may be affected as such
1029 * since this is just a performance hack to test what we might be able
1030 * to get if the log were not present.
1031 */
1032 if (mp->m_logbufs == 0) {
1033 xlog_debug = 0;
1034 xlog_target = log->l_targ;
1035 log->l_iclog_bufs = XLOG_MIN_ICLOGS;
1036 } else
1037#endif
1038 {
1039 /*
1040 * This is the normal path. If m_logbufs == -1, then the
1041 * admin has chosen to use the system defaults for logbuffers.
1042 */
1043 if (mp->m_logbufs == -1) {
1044 if (xfs_physmem <= btoc(128*1024*1024)) {
1045 log->l_iclog_bufs = XLOG_MIN_ICLOGS;
1046 } else if (xfs_physmem <= btoc(400*1024*1024)) {
1047 log->l_iclog_bufs = XLOG_MED_ICLOGS;
1048 } else {
1049 /* 256K with 32K bufs */
1050 log->l_iclog_bufs = XLOG_MAX_ICLOGS;
1051 }
1052 } else
1053 log->l_iclog_bufs = mp->m_logbufs;
1054
1055#if defined(DEBUG) || defined(XLOG_NOLOG)
1056 /* We are reactivating a filesystem after it was inactive */
1057 if (log->l_targ == xlog_target) {
1058 xlog_target = NULL;
1059 xlog_debug = 1;
1060 }
1061#endif
1062 }
1063
1064 /*
1065 * Buffer size passed in from mount system call.
1066 */
1067 if (mp->m_logbsize != -1) {
1068 size = log->l_iclog_size = mp->m_logbsize;
1069 log->l_iclog_size_log = 0;
1070 while (size != 1) {
1071 log->l_iclog_size_log++;
1072 size >>= 1;
1073 }
1074
1075 if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
1076 /* # headers = size / 32K
1077 * one header holds cycles from 32K of data
1078 */
1079
1080 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
1081 if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
1082 xhdrs++;
1083 log->l_iclog_hsize = xhdrs << BBSHIFT;
1084 log->l_iclog_heads = xhdrs;
1085 } else {
1086 ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
1087 log->l_iclog_hsize = BBSIZE;
1088 log->l_iclog_heads = 1;
1089 }
1090 return;
1091 }
1092
1093 /*
1094 * Special case machines that have less than 32MB of memory.
1095 * All machines with more memory use 32KB buffers.
1096 */
1097 if (xfs_physmem <= btoc(32*1024*1024)) {
1098 /* Don't change; min configuration */
1099 log->l_iclog_size = XLOG_RECORD_BSIZE; /* 16k */
1100 log->l_iclog_size_log = XLOG_RECORD_BSHIFT;
1101 } else {
1102 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; /* 32k */
1103 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1104 }
1105
1106 /* the default record buffer size is 16k or 32k, which needs one header sector */
1107 log->l_iclog_hsize = BBSIZE;
1108 log->l_iclog_heads = 1;
1109
1110 /*
1111 * For 16KB block sizes, we use 3 32KB buffers. For 32KB block sizes,
1112 * we use 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
1113 */
1114 if (mp->m_sb.sb_blocksize >= 16*1024) {
1115 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1116 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1117 if (mp->m_logbufs == -1) {
1118 switch (mp->m_sb.sb_blocksize) {
1119 case 16*1024: /* 16 KB */
1120 log->l_iclog_bufs = 3;
1121 break;
1122 case 32*1024: /* 32 KB */
1123 log->l_iclog_bufs = 4;
1124 break;
1125 case 64*1024: /* 64 KB */
1126 log->l_iclog_bufs = 8;
1127 break;
1128 default:
1129 xlog_panic("XFS: Invalid blocksize");
1130 break;
1131 }
1132 }
1133 }
1134} /* xlog_get_iclog_buffer_size */
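
/*
 * Worked example of the mount-specified path above (hypothetical values):
 * with logbsize = 64KB, the shift loop yields l_iclog_size_log = 16. On
 * a v2 log, xhdrs = 64KB / XLOG_HEADER_CYCLE_SIZE (32KB) = 2, so
 * l_iclog_hsize = 2 << BBSHIFT = 1024 bytes and l_iclog_heads = 2; a v1
 * log gets a single BBSIZE (512 byte) header instead.
 */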
1135
1136
1137/*
1138 * This routine initializes some of the log structure for a given mount point.
1139 * Its primary purpose is to fill in enough, so recovery can occur. However,
1140 * some other stuff may be filled in too.
1141 */
1142STATIC xlog_t *
1143xlog_alloc_log(xfs_mount_t *mp,
1144 xfs_buftarg_t *log_target,
1145 xfs_daddr_t blk_offset,
1146 int num_bblks)
1147{
1148 xlog_t *log;
1149 xlog_rec_header_t *head;
1150 xlog_in_core_t **iclogp;
1151 xlog_in_core_t *iclog, *prev_iclog=NULL;
1152 xfs_buf_t *bp;
1153 int i;
1154 int iclogsize;
1155
1156 log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
1157
1158 log->l_mp = mp;
1159 log->l_targ = log_target;
1160 log->l_logsize = BBTOB(num_bblks);
1161 log->l_logBBstart = blk_offset;
1162 log->l_logBBsize = num_bblks;
1163 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1164 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1165
1166 log->l_prev_block = -1;
1167 ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0);
1168 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1169 log->l_last_sync_lsn = log->l_tail_lsn;
1170 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1171 log->l_grant_reserve_cycle = 1;
1172 log->l_grant_write_cycle = 1;
1173
1174 if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) {
1175 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
1176 ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
1177 /* for larger sector sizes, must have v2 or external log */
1178 ASSERT(log->l_sectbb_log == 0 ||
1179 log->l_logBBstart == 0 ||
1180 XFS_SB_VERSION_HASLOGV2(&mp->m_sb));
1181 ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
1182 }
1183 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
1184
1185 xlog_get_iclog_buffer_size(mp, log);
1186
1187 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
1188 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1189 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1190 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1191 ASSERT(XFS_BUF_ISBUSY(bp));
1192 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
1193 log->l_xbuf = bp;
1194
1195 spinlock_init(&log->l_icloglock, "iclog");
1196 spinlock_init(&log->l_grant_lock, "grhead_iclog");
1197 initnsema(&log->l_flushsema, 0, "ic-flush");
1198 xlog_state_ticket_alloc(log); /* wait until after icloglock inited */
1199
1200 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1201 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1202
1203 iclogp = &log->l_iclog;
1204 /*
1205 * The amount of memory to allocate for the iclog structure is
1206 * rather funky due to the way the structure is defined. It is
1207 * done this way so that we can use different sizes for machines
1208 * with different amounts of memory. See the definition of
1209 * xlog_in_core_t in xfs_log_priv.h for details.
1210 */
1211 iclogsize = log->l_iclog_size;
1212 ASSERT(log->l_iclog_size >= 4096);
1213 for (i=0; i < log->l_iclog_bufs; i++) {
1214 *iclogp = (xlog_in_core_t *)
1215 kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
1216 iclog = *iclogp;
1217 iclog->hic_data = (xlog_in_core_2_t *)
1218 kmem_zalloc(iclogsize, KM_SLEEP);
1219
1220 iclog->ic_prev = prev_iclog;
1221 prev_iclog = iclog;
1222 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1223
1224 head = &iclog->ic_header;
1225 memset(head, 0, sizeof(xlog_rec_header_t));
1226 INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
1227 INT_SET(head->h_version, ARCH_CONVERT,
1228 XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
1229 INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size);
1230 /* new fields */
1231 INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
1232 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1233
1234 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
1235 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1236 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1237 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1238 iclog->ic_bp = bp;
1239
1240 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
1241 iclog->ic_state = XLOG_STATE_ACTIVE;
1242 iclog->ic_log = log;
1243 iclog->ic_callback_tail = &(iclog->ic_callback);
1244 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
1245
1246 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1247 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1248 sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
1249 sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");
1250
1251 iclogp = &iclog->ic_next;
1252 }
1253 *iclogp = log->l_iclog; /* complete ring */
1254 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1255
1256 return log;
1257} /* xlog_alloc_log */
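
/*
 * To make the ring construction above concrete: with l_iclog_bufs == 3,
 * the loop produces ic_next links A -> B -> C, the final *iclogp
 * assignment closes the circle with C -> A, and the last line rewrites
 * A->ic_prev to point at C so the ic_prev pointers form a ring as well.
 */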
1258
1259
1260/*
1261 * Write out the commit record of a transaction associated with the given
1262 * ticket. Return the lsn of the commit record.
1263 */
1264STATIC int
1265xlog_commit_record(xfs_mount_t *mp,
1266 xlog_ticket_t *ticket,
1267 xlog_in_core_t **iclog,
1268 xfs_lsn_t *commitlsnp)
1269{
1270 int error;
1271 xfs_log_iovec_t reg[1];
1272
1273 reg[0].i_addr = NULL;
1274 reg[0].i_len = 0;
1275
1276 ASSERT_ALWAYS(iclog);
1277 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
1278 iclog, XLOG_COMMIT_TRANS))) {
1279 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
1280 }
1281 return (error);
1282} /* xlog_commit_record */
1283
1284
1285/*
1286 * Push on the buffer cache code if we ever use more than 75% of the on-disk
1287 * log space. This code pushes on the lsn which would supposedly free up
1288 * the 25% which we want to leave free. We may need to adopt a policy which
1289 * pushes on an lsn which is further along in the log once we reach the high
1290 * water mark. In this manner, we would be creating a low water mark.
1291 */
1292void
1293xlog_grant_push_ail(xfs_mount_t *mp,
1294 int need_bytes)
1295{
1296 xlog_t *log = mp->m_log; /* pointer to the log */
1297 xfs_lsn_t tail_lsn; /* lsn of the log tail */
1298 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */
1299 int free_blocks; /* free blocks left to write to */
1300 int free_bytes; /* free bytes left to write to */
1301 int threshold_block; /* block in lsn we'd like to be at */
1302 int threshold_cycle; /* lsn cycle we'd like to be at */
1303 int free_threshold;
1304 SPLDECL(s);
1305
1306 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1307
1308 s = GRANT_LOCK(log);
1309 free_bytes = xlog_space_left(log,
1310 log->l_grant_reserve_cycle,
1311 log->l_grant_reserve_bytes);
1312 tail_lsn = log->l_tail_lsn;
1313 free_blocks = BTOBBT(free_bytes);
1314
1315 /*
1316 * Set the threshold for the minimum number of free blocks in the
1317 * log to the maximum of what the caller needs, one quarter of the
1318 * log, and 256 blocks.
1319 */
1320 free_threshold = BTOBB(need_bytes);
1321 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1322 free_threshold = MAX(free_threshold, 256);
1323 if (free_blocks < free_threshold) {
1324 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1325 threshold_cycle = CYCLE_LSN(tail_lsn);
1326 if (threshold_block >= log->l_logBBsize) {
1327 threshold_block -= log->l_logBBsize;
1328 threshold_cycle += 1;
1329 }
1330 ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle,
1331 threshold_block);
1332
1333 /* Don't pass in an lsn greater than the lsn of the last
1334 * log record known to be on disk.
1335 */
1336 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
1337 threshold_lsn = log->l_last_sync_lsn;
1338 }
1339 GRANT_UNLOCK(log, s);
1340
1341 /*
1342 * Get the transaction layer to kick the dirty buffers out to
1343 * disk asynchronously. No point in trying to do this if
1344 * the filesystem is shutting down.
1345 */
1346 if (threshold_lsn &&
1347 !XLOG_FORCED_SHUTDOWN(log))
1348 xfs_trans_push_ail(mp, threshold_lsn);
1349} /* xlog_grant_push_ail */
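
/*
 * Worked example of the threshold logic above (hypothetical numbers):
 * for a log of 8192 basic blocks and a small need_bytes, free_threshold
 * = MAX(BTOBB(need_bytes), 8192 >> 2, 256) = 2048 blocks. With the tail
 * at cycle 3, block 7000, the push target becomes block 9048, which
 * wraps to cycle 4, block 856 -- and is then clamped to l_last_sync_lsn
 * if that lsn is older.
 */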
1350
1351
1352/*
1353 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
1354 * fashion. Before this is called, the current iclog ptr in the log should
1355 * have been moved to point to the next available iclog. This allows further
1356 * writes to continue while this code syncs out an iclog ready to go.
1357 * Before an in-core log can be written out, the data section must be scanned
1358 * to save away the 1st word of each BBSIZE block into the header. We replace
1359 * it with the current cycle count. Each BBSIZE block is tagged with the
1360 * cycle count because there is an implicit assumption that drives will
1361 * guarantee that entire 512 byte blocks get written at once. In other words,
1362 * we can't have part of a 512 byte block written and part not written. By
1363 * tagging each block, we will know which blocks are valid when recovering
1364 * after an unclean shutdown.
1365 *
1366 * This routine is single threaded on the iclog. No other thread can be in
1367 * this routine with the same iclog. Changing the contents of the iclog can
1368 * therefore be done without grabbing the state machine lock. Updating the global
1369 * log will require grabbing the lock though.
1370 *
1371 * The entire log manager uses a logical block numbering scheme. Only
1372 * log_sync (and then only bwrite()) know about the fact that the log may
1373 * not start with block zero on a given device. The log block start offset
1374 * is added immediately before calling bwrite().
1375 */
1376
1377int
1378xlog_sync(xlog_t *log,
1379 xlog_in_core_t *iclog)
1380{
1381 xfs_caddr_t dptr; /* pointer to byte sized element */
1382 xfs_buf_t *bp;
1383 int i, ops;
1384 uint count; /* byte count of bwrite */
1385 uint count_init; /* initial count before roundup */
1386 int roundoff; /* roundoff to BB or stripe */
1387 int split = 0; /* split write into two regions */
1388 int error;
1389 SPLDECL(s);
1390 int v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb);
1391
1392 XFS_STATS_INC(xs_log_writes);
1393 ASSERT(iclog->ic_refcnt == 0);
1394
1395 /* Add for LR header */
1396 count_init = log->l_iclog_hsize + iclog->ic_offset;
1397
1398 /* Round out the log write size */
1399 if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
1400 /* we have a v2 stripe unit to use */
1401 count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
1402 } else {
1403 count = BBTOB(BTOBB(count_init));
1404 }
1405 roundoff = count - count_init;
1406 ASSERT(roundoff >= 0);
1407 ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
1408 roundoff < log->l_mp->m_sb.sb_logsunit)
1409 ||
1410 (log->l_mp->m_sb.sb_logsunit <= 1 &&
1411 roundoff < BBTOB(1)));
1412
1413 /* move grant heads by roundoff in sync */
1414 s = GRANT_LOCK(log);
1415 XLOG_GRANT_ADD_SPACE(log, roundoff, 'w');
1416 XLOG_GRANT_ADD_SPACE(log, roundoff, 'r');
1417 GRANT_UNLOCK(log, s);
1418
1419 /* put cycle number in every block */
1420 xlog_pack_data(log, iclog, roundoff);
1421
1422 /* real byte length */
1423 if (v2) {
1424 INT_SET(iclog->ic_header.h_len,
1425 ARCH_CONVERT,
1426 iclog->ic_offset + roundoff);
1427 } else {
1428 INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);
1429 }
1430
1431 /* put ops count in correct order */
1432 ops = iclog->ic_header.h_num_logops;
1433 INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops);
1434
1435 bp = iclog->ic_bp;
1436 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
1437 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
1438 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)));
1439
1440 XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
1441
1442 /* Do we need to split this write into 2 parts? */
1443 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1444 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1445 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1446 iclog->ic_bwritecnt = 2; /* split into 2 writes */
1447 } else {
1448 iclog->ic_bwritecnt = 1;
1449 }
1450 XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
1451 XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */
1452 XFS_BUF_BUSY(bp);
1453 XFS_BUF_ASYNC(bp);
1454 /*
1455 * Do a disk write cache flush for the log block.
1456 * This is a bit of a sledgehammer, it would be better
1457 * to use a tag barrier here that just prevents reordering.
1458 * It may not be needed to flush the first split block in the log wrap
1459 * case, but do it anyway to be safe. -AK
1460 */
1461 if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
1462 XFS_BUF_FLUSH(bp);
1463
1464 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1465 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1466
1467 xlog_verify_iclog(log, iclog, count, B_TRUE);
1468
1469 /* account for log which doesn't start at block #0 */
1470 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1471 /*
1472 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
1473 * is shutting down.
1474 */
1475 XFS_BUF_WRITE(bp);
1476
1477 if ((error = XFS_bwrite(bp))) {
1478 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1479 XFS_BUF_ADDR(bp));
1480 return (error);
1481 }
1482 if (split) {
1483 bp = iclog->ic_log->l_xbuf;
1484 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
1485 (unsigned long)1);
1486 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
1487 XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
1488 XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
1489 (__psint_t)count), split);
1490 XFS_BUF_SET_FSPRIVATE(bp, iclog);
1491 XFS_BUF_BUSY(bp);
1492 XFS_BUF_ASYNC(bp);
1493 if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
1494 XFS_BUF_FLUSH(bp);
1495 dptr = XFS_BUF_PTR(bp);
1496 /*
1497 * Bump the cycle numbers at the start of each block
1498 * since this part of the buffer is at the start of
1499 * a new cycle. Watch out for the header magic number
1500 * case, though.
1501 */
1502 for (i=0; i<split; i += BBSIZE) {
1503 INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
1504 if (INT_GET(*(uint *)dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
1505 INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
1506 dptr += BBSIZE;
1507 }
1508
1509 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1510 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1511
1512 /* account for internal log which doesn't start at block #0 */
1513 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1514 XFS_BUF_WRITE(bp);
1515 if ((error = XFS_bwrite(bp))) {
1516 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1517 bp, XFS_BUF_ADDR(bp));
1518 return (error);
1519 }
1520 }
1521 return (0);
1522} /* xlog_sync */
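
/*
 * Worked example of the round-up above (hypothetical numbers): on a v2
 * log with sb_logsunit = 4096, an iclog with l_iclog_hsize + ic_offset =
 * 5000 bytes is rounded up to count = 8192, so roundoff = 3192 bytes is
 * added to both grant heads and folded into h_len. A v1 log only rounds
 * up to the next BBSIZE (512 byte) boundary, giving count = 5120.
 */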
1523
1524
1525/*
1526 * Unallocate a log structure
1527 */
1528void
1529xlog_unalloc_log(xlog_t *log)
1530{
1531 xlog_in_core_t *iclog, *next_iclog;
1532 xlog_ticket_t *tic, *next_tic;
1533 int i;
1534
1535
1536 iclog = log->l_iclog;
1537 for (i=0; i<log->l_iclog_bufs; i++) {
1538 sv_destroy(&iclog->ic_forcesema);
1539 sv_destroy(&iclog->ic_writesema);
1540 xfs_buf_free(iclog->ic_bp);
1541#ifdef XFS_LOG_TRACE
1542 if (iclog->ic_trace != NULL) {
1543 ktrace_free(iclog->ic_trace);
1544 }
1545#endif
1546 next_iclog = iclog->ic_next;
1547 kmem_free(iclog->hic_data, log->l_iclog_size);
1548 kmem_free(iclog, sizeof(xlog_in_core_t));
1549 iclog = next_iclog;
1550 }
1551 freesema(&log->l_flushsema);
1552 spinlock_destroy(&log->l_icloglock);
1553 spinlock_destroy(&log->l_grant_lock);
1554
1555 /* XXXsup take a look at this again. */
1556 if ((log->l_ticket_cnt != log->l_ticket_tcnt) &&
1557 !XLOG_FORCED_SHUTDOWN(log)) {
1558 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1559 "xlog_unalloc_log: (cnt: %d, total: %d)",
1560 log->l_ticket_cnt, log->l_ticket_tcnt);
1561 /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
1562
1563 } else {
1564 tic = log->l_unmount_free;
1565 while (tic) {
1566 next_tic = tic->t_next;
1567 kmem_free(tic, NBPP);
1568 tic = next_tic;
1569 }
1570 }
1571 xfs_buf_free(log->l_xbuf);
1572#ifdef XFS_LOG_TRACE
1573 if (log->l_trace != NULL) {
1574 ktrace_free(log->l_trace);
1575 }
1576 if (log->l_grant_trace != NULL) {
1577 ktrace_free(log->l_grant_trace);
1578 }
1579#endif
1580 log->l_mp->m_log = NULL;
1581 kmem_free(log, sizeof(xlog_t));
1582} /* xlog_unalloc_log */
1583
1584/*
1585 * Update counters atomically now that memcpy is done.
1586 */
1587/* ARGSUSED */
1588static inline void
1589xlog_state_finish_copy(xlog_t *log,
1590 xlog_in_core_t *iclog,
1591 int record_cnt,
1592 int copy_bytes)
1593{
1594 SPLDECL(s);
1595
1596 s = LOG_LOCK(log);
1597
1598 iclog->ic_header.h_num_logops += record_cnt;
1599 iclog->ic_offset += copy_bytes;
1600
1601 LOG_UNLOCK(log, s);
1602} /* xlog_state_finish_copy */
1603
1604
1605
1606
1607/*
1608 * Write some region out to in-core log
1609 *
1610 * This will be called when writing externally provided regions or when
1611 * writing out a commit record for a given transaction.
1612 *
1613 * General algorithm:
1614 * 1. Find total length of this write. This may include adding to the
1615 * lengths passed in.
1616 * 2. Check whether we violate the ticket's reservation.
1617 * 3. While writing to this iclog
1618 * A. Reserve as much space in this iclog as we can get
1619 * B. If this is first write, save away start lsn
1620 * C. While writing this region:
1621 * 1. If first write of transaction, write start record
1622 * 2. Write log operation header (header per region)
1623 * 3. Find out if we can fit entire region into this iclog
1624 * 4. Potentially, verify destination memcpy ptr
1625 * 5. Memcpy (partial) region
1626 * 6. If partial copy, release iclog; otherwise, continue
1627 * copying more regions into current iclog
1628 * 4. Mark want sync bit (in simulation mode)
1629 * 5. Release iclog for potential flush to on-disk log.
1630 *
1631 * ERRORS:
1632 * 1. Panic if reservation is overrun. This should never happen since
1633 * reservation amounts are generated internal to the filesystem.
1634 * NOTES:
1635 * 1. Tickets are single threaded data structures.
1636 * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
1637 * syncing routine. When a single log_write region needs to span
1638 * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
1639 * on all log operation writes which don't contain the end of the
1640 * region. The XLOG_END_TRANS bit is used for the in-core log
1641 * operation which contains the end of the continued log_write region.
1642 * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
1643 * we don't really know exactly how much space will be used. As a result,
1644 * we don't update ic_offset until the end when we know exactly how many
1645 * bytes have been written out.
1646 */
1647int
1648xlog_write(xfs_mount_t * mp,
1649 xfs_log_iovec_t reg[],
1650 int nentries,
1651 xfs_log_ticket_t tic,
1652 xfs_lsn_t *start_lsn,
1653 xlog_in_core_t **commit_iclog,
1654 uint flags)
1655{
1656 xlog_t *log = mp->m_log;
1657 xlog_ticket_t *ticket = (xlog_ticket_t *)tic;
1658 xlog_op_header_t *logop_head; /* ptr to log operation header */
1659 xlog_in_core_t *iclog; /* ptr to current in-core log */
1660 __psint_t ptr; /* copy address into data region */
1661 int len; /* # xlog_write() bytes still to copy */
1662 int index; /* region index currently copying */
1663 int log_offset; /* offset (from 0) into data region */
1664 int start_rec_copy; /* # bytes to copy for start record */
1665 int partial_copy; /* did we split a region? */
1666 int partial_copy_len;/* # bytes copied if split region */
1667 int need_copy; /* # bytes need to memcpy this region */
1668 int copy_len; /* # bytes actually memcpy'ing */
1669 int copy_off; /* # bytes from entry start */
1670 int contwr; /* continued write of in-core log? */
1671 int error;
1672 int record_cnt = 0, data_cnt = 0;
1673
1674 partial_copy_len = partial_copy = 0;
1675
1676 /* Calculate potential maximum space. Each region gets its own
1677 * xlog_op_header_t and may need to be double word aligned.
1678 */
1679 len = 0;
1680 if (ticket->t_flags & XLOG_TIC_INITED) /* acct for start rec of xact */
1681 len += sizeof(xlog_op_header_t);
1682
1683 for (index = 0; index < nentries; index++) {
1684 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1685 len += reg[index].i_len;
1686 }
1687 contwr = *start_lsn = 0;
1688
1689 if (ticket->t_curr_res < len) {
1690#ifdef DEBUG
1691 xlog_panic(
1692 "xfs_log_write: reservation ran out. Need to up reservation");
1693#else
1694 /* Customer configurable panic */
1695 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1696 "xfs_log_write: reservation ran out. Need to up reservation");
1697 /* If we did not panic, shutdown the filesystem */
1698 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
1699#endif
1700 } else
1701 ticket->t_curr_res -= len;
1702
1703 for (index = 0; index < nentries; ) {
1704 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1705 &contwr, &log_offset)))
1706 return (error);
1707
1708 ASSERT(log_offset <= iclog->ic_size - 1);
1709 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
1710
1711 /* start_lsn is the first lsn written to. That's all we need. */
1712 if (! *start_lsn)
1713 *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
1714
1715 /* This loop writes out as many regions as can fit in the amount
1716 * of space which was allocated by xlog_state_get_iclog_space().
1717 */
1718 while (index < nentries) {
1719 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1720 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1721 start_rec_copy = 0;
1722
1723 /* If first write for transaction, insert start record.
1724 * We can't be trying to commit if we are inited. We can't
1725 * have any "partial_copy" if we are inited.
1726 */
1727 if (ticket->t_flags & XLOG_TIC_INITED) {
1728 logop_head = (xlog_op_header_t *)ptr;
1729 INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
1730 logop_head->oh_clientid = ticket->t_clientid;
1731 logop_head->oh_len = 0;
1732 logop_head->oh_flags = XLOG_START_TRANS;
1733 logop_head->oh_res2 = 0;
1734 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1735 record_cnt++;
1736
1737 start_rec_copy = sizeof(xlog_op_header_t);
1738 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1739 }
1740
1741 /* Copy log operation header directly into data section */
1742 logop_head = (xlog_op_header_t *)ptr;
1743 INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
1744 logop_head->oh_clientid = ticket->t_clientid;
1745 logop_head->oh_res2 = 0;
1746
1747 /* header copied directly */
1748 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
1749
1750 /* are we copying a commit or unmount record? */
1751 logop_head->oh_flags = flags;
1752
1753 /*
1754 * We've seen logs corrupted with bad transaction client
1755 * ids. This makes sure that XFS doesn't generate them here.
1756 * If a bad one shows up, turn it into an EIO and shut down the filesystem.
1757 */
1758 switch (logop_head->oh_clientid) {
1759 case XFS_TRANSACTION:
1760 case XFS_VOLUME:
1761 case XFS_LOG:
1762 break;
1763 default:
1764 xfs_fs_cmn_err(CE_WARN, mp,
1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1766 logop_head->oh_clientid, tic);
1767 return XFS_ERROR(EIO);
1768 }
1769
1770 /* Partial write last time? => (partial_copy != 0)
1771 * need_copy is the amount we'd like to copy if everything could
1772 * fit in the current memcpy.
1773 */
1774 need_copy = reg[index].i_len - partial_copy_len;
1775
1776 copy_off = partial_copy_len;
1777 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */
1778 INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy);
1779 if (partial_copy)
1780 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1781 partial_copy_len = partial_copy = 0;
1782 } else { /* partial write */
1783 copy_len = iclog->ic_size - log_offset;
1784 INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len);
1785 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1786 if (partial_copy)
1787 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1788 partial_copy_len += copy_len;
1789 partial_copy++;
1790 len += sizeof(xlog_op_header_t); /* from splitting of region */
1791 /* account for new log op header */
1792 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1793 }
1794 xlog_verify_dest_ptr(log, ptr);
1795
1796 /* copy region */
1797 ASSERT(copy_len >= 0);
1798 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1799 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1800
1801 /* make copy_len total bytes copied, including headers */
1802 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1803 record_cnt++;
1804 data_cnt += contwr ? copy_len : 0;
1805 if (partial_copy) { /* copied partial region */
1806 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1807 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1808 record_cnt = data_cnt = 0;
1809 if ((error = xlog_state_release_iclog(log, iclog)))
1810 return (error);
1811 break; /* don't increment index */
1812 } else { /* copied entire region */
1813 index++;
1814 partial_copy_len = partial_copy = 0;
1815
1816 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1817 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1818 record_cnt = data_cnt = 0;
1819 xlog_state_want_sync(log, iclog);
1820 if (commit_iclog) {
1821 ASSERT(flags & XLOG_COMMIT_TRANS);
1822 *commit_iclog = iclog;
1823 } else if ((error = xlog_state_release_iclog(log, iclog)))
1824 return (error);
1825 if (index == nentries)
1826 return 0; /* we are done */
1827 else
1828 break;
1829 }
1830 } /* if (partial_copy) */
1831 } /* while (index < nentries) */
1832 } /* for (index = 0; index < nentries; ) */
1833 ASSERT(len == 0);
1834
1835 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1836 if (commit_iclog) {
1837 ASSERT(flags & XLOG_COMMIT_TRANS);
1838 *commit_iclog = iclog;
1839 return 0;
1840 }
1841 return (xlog_state_release_iclog(log, iclog));
1842} /* xlog_write */
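
/*
 * Worked example of the length calculation above (hypothetical sizes,
 * assuming a 12-byte xlog_op_header_t): for a freshly INITED ticket
 * writing two regions of 128 and 64 bytes,
 *
 *	len = 12 (start record) + 2 * 12 (op headers) + 128 + 64 = 228
 *
 * and every region split across iclogs adds one more op header to len
 * and subtracts its size from the ticket's current reservation.
 */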
1843
1844
1845/*****************************************************************************
1846 *
1847 * State Machine functions
1848 *
1849 *****************************************************************************
1850 */
1851
1852/* Clean iclogs starting from the head. This ordering must be
1853 * maintained, so an iclog doesn't become ACTIVE beyond one that
1854 * is SYNCING. This is also required to maintain the notion that we use
1855 * a counting semaphore to hold off would-be writers to the log when every
1856 * iclog is trying to sync to disk.
1857 *
1858 * State Change: DIRTY -> ACTIVE
1859 */
1860void
1861xlog_state_clean_log(xlog_t *log)
1862{
1863 xlog_in_core_t *iclog;
1864 int changed = 0;
1865
1866 iclog = log->l_iclog;
1867 do {
1868 if (iclog->ic_state == XLOG_STATE_DIRTY) {
1869 iclog->ic_state = XLOG_STATE_ACTIVE;
1870 iclog->ic_offset = 0;
1871 iclog->ic_callback = NULL; /* don't need to free */
1872 /*
1873 * If the number of ops in this iclog indicate it just
1874 * contains the dummy transaction, we can
1875 * change state into IDLE (the second time around).
1876 * Otherwise we should change the state into
1877 * NEED a dummy.
1878 * We don't need to cover the dummy.
1879 */
1880 if (!changed &&
1881 (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) {
1882 changed = 1;
1883 } else {
1884 /*
1885 * We have two dirty iclogs so start over
1886 * This could also be num of ops indicates
1887 * this is not the dummy going out.
1888 */
1889 changed = 2;
1890 }
1891 iclog->ic_header.h_num_logops = 0;
1892 memset(iclog->ic_header.h_cycle_data, 0,
1893 sizeof(iclog->ic_header.h_cycle_data));
1894 iclog->ic_header.h_lsn = 0;
1895 } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
1896 /* do nothing */;
1897 else
1898 break; /* stop cleaning */
1899 iclog = iclog->ic_next;
1900 } while (iclog != log->l_iclog);
1901
1902 /* log is locked when we are called */
1903 /*
1904 * Change state for the dummy log recording.
1905 * We usually go to NEED. But we go to NEED2 if changed indicates
1906 * that we are done writing the dummy record.
1907 * If we are done with the second dummy record (DONE2), then
1908 * we go to IDLE.
1909 */
1910 if (changed) {
1911 switch (log->l_covered_state) {
1912 case XLOG_STATE_COVER_IDLE:
1913 case XLOG_STATE_COVER_NEED:
1914 case XLOG_STATE_COVER_NEED2:
1915 log->l_covered_state = XLOG_STATE_COVER_NEED;
1916 break;
1917
1918 case XLOG_STATE_COVER_DONE:
1919 if (changed == 1)
1920 log->l_covered_state = XLOG_STATE_COVER_NEED2;
1921 else
1922 log->l_covered_state = XLOG_STATE_COVER_NEED;
1923 break;
1924
1925 case XLOG_STATE_COVER_DONE2:
1926 if (changed == 1)
1927 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1928 else
1929 log->l_covered_state = XLOG_STATE_COVER_NEED;
1930 break;
1931
1932 default:
1933 ASSERT(0);
1934 }
1935 }
1936} /* xlog_state_clean_log */
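
/*
 * Taken together with xfs_log_need_covered() above, the covering state
 * typically walks (on an otherwise idle log): COVER_NEED -> COVER_DONE
 * when the first dummy record goes out, COVER_DONE -> COVER_NEED2 here
 * once that iclog is cleaned with changed == 1, COVER_NEED2 ->
 * COVER_DONE2 for the second dummy record, and COVER_DONE2 ->
 * COVER_IDLE here when changed == 1 again.
 */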
1937
1938STATIC xfs_lsn_t
1939xlog_get_lowest_lsn(
1940 xlog_t *log)
1941{
1942 xlog_in_core_t *lsn_log;
1943 xfs_lsn_t lowest_lsn, lsn;
1944
1945 lsn_log = log->l_iclog;
1946 lowest_lsn = 0;
1947 do {
1948 if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
1949 lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT);
1950 if ((lsn && !lowest_lsn) ||
1951 (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
1952 lowest_lsn = lsn;
1953 }
1954 }
1955 lsn_log = lsn_log->ic_next;
1956 } while (lsn_log != log->l_iclog);
1957 return(lowest_lsn);
1958}
1959
1960
1961STATIC void
1962xlog_state_do_callback(
1963 xlog_t *log,
1964 int aborted,
1965 xlog_in_core_t *ciclog)
1966{
1967 xlog_in_core_t *iclog;
1968 xlog_in_core_t *first_iclog; /* used to know when we've
1969 * processed all iclogs once */
1970 xfs_log_callback_t *cb, *cb_next;
1971 int flushcnt = 0;
1972 xfs_lsn_t lowest_lsn;
1973 int ioerrors; /* counter: iclogs with errors */
1974 int loopdidcallbacks; /* flag: inner loop did callbacks*/
1975 int funcdidcallbacks; /* flag: function did callbacks */
1976 int repeats; /* for issuing console warnings if
1977 * looping too many times */
1978 SPLDECL(s);
1979
1980 s = LOG_LOCK(log);
1981 first_iclog = iclog = log->l_iclog;
1982 ioerrors = 0;
1983 funcdidcallbacks = 0;
1984 repeats = 0;
1985
1986 do {
1987 /*
1988 * Scan all iclogs starting with the one pointed to by the
1989 * log. Reset this starting point each time the log is
1990 * unlocked (during callbacks).
1991 *
1992 * Keep looping through iclogs until one full pass is made
1993 * without running any callbacks.
1994 */
1995 first_iclog = log->l_iclog;
1996 iclog = log->l_iclog;
1997 loopdidcallbacks = 0;
1998 repeats++;
1999
2000 do {
2001
2002 /* skip all iclogs in the ACTIVE & DIRTY states */
2003 if (iclog->ic_state &
2004 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2005 iclog = iclog->ic_next;
2006 continue;
2007 }
2008
2009 /*
2010 * Between marking a filesystem SHUTDOWN and stopping
2011 * the log, we do flush all iclogs to disk (if there
2012 * wasn't a log I/O error). So, we do want things to
2013 * go smoothly in case of just a SHUTDOWN w/o a
2014 * LOG_IO_ERROR.
2015 */
2016 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2017 /*
2018 * Can only perform callbacks in order. Since
2019 * this iclog is not in the DONE_SYNC/
2020 * DO_CALLBACK state, we skip the rest and
2021 * just try to clean up. If we set our iclog
2022 * to DO_CALLBACK, we will not process it when
2023 * we retry since a previous iclog is in the
2024 * CALLBACK and the state cannot change since
2025 * we are holding the LOG_LOCK.
2026 */
2027 if (!(iclog->ic_state &
2028 (XLOG_STATE_DONE_SYNC |
2029 XLOG_STATE_DO_CALLBACK))) {
2030 if (ciclog && (ciclog->ic_state ==
2031 XLOG_STATE_DONE_SYNC)) {
2032 ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2033 }
2034 break;
2035 }
2036 /*
2037 * We now have an iclog that is in either the
2038 * DO_CALLBACK or DONE_SYNC state. The other
2039 * states (WANT_SYNC, SYNCING, or CALLBACK) were
2040 * caught by the if above and are skipped here;
2041 * that is, we aren't doing their callbacks (see
2042 * the if above).
2043 */
2044
2045 /*
2046 * We will do one more check here to see if we
2047 * have chased our tail around.
2048 */
2049
2050 lowest_lsn = xlog_get_lowest_lsn(log);
2051 if (lowest_lsn && (
2052 XFS_LSN_CMP(
2053 lowest_lsn,
2054 INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
2055 )<0)) {
2056 iclog = iclog->ic_next;
2057 continue; /* Leave this iclog for
2058 * another thread */
2059 }
2060
2061 iclog->ic_state = XLOG_STATE_CALLBACK;
2062
2063 LOG_UNLOCK(log, s);
2064
2065 /* l_last_sync_lsn field protected by
2066 * GRANT_LOCK. Don't worry about iclog's lsn.
2067 * No one else can be here except us.
2068 */
2069 s = GRANT_LOCK(log);
2070 ASSERT(XFS_LSN_CMP(
2071 log->l_last_sync_lsn,
2072 INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
2073 )<=0);
2074 log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
2075 GRANT_UNLOCK(log, s);
2076
2077 /*
2078 * Keep processing entries in the callback list
2079 * until we come around and it is empty. We
2080 * need to atomically see that the list is
2081 * empty and change the state to DIRTY so that
2082 * we don't miss any more callbacks being added.
2083 */
2084 s = LOG_LOCK(log);
2085 } else {
2086 ioerrors++;
2087 }
2088 cb = iclog->ic_callback;
2089
2090 while (cb != 0) {
2091 iclog->ic_callback_tail = &(iclog->ic_callback);
2092 iclog->ic_callback = NULL;
2093 LOG_UNLOCK(log, s);
2094
2095 /* perform callbacks in the order given */
2096 for (; cb != 0; cb = cb_next) {
2097 cb_next = cb->cb_next;
2098 cb->cb_func(cb->cb_arg, aborted);
2099 }
2100 s = LOG_LOCK(log);
2101 cb = iclog->ic_callback;
2102 }
2103
2104 loopdidcallbacks++;
2105 funcdidcallbacks++;
2106
2107 ASSERT(iclog->ic_callback == 0);
2108 if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2109 iclog->ic_state = XLOG_STATE_DIRTY;
2110
2111 /*
2112 * Transition from DIRTY to ACTIVE if applicable.
2113 * NOP if STATE_IOERROR.
2114 */
2115 xlog_state_clean_log(log);
2116
2117 /* wake up threads waiting in xfs_log_force() */
2118 sv_broadcast(&iclog->ic_forcesema);
2119
2120 iclog = iclog->ic_next;
2121 } while (first_iclog != iclog);
2122 if (repeats && (repeats % 10) == 0) {
2123 xfs_fs_cmn_err(CE_WARN, log->l_mp,
2124 "xlog_state_do_callback: looping %d", repeats);
2125 }
2126 } while (!ioerrors && loopdidcallbacks);
2127
2128 /*
2129 * make one last gasp attempt to see if iclogs are being left in
2130 * limbo.
2131 */
2132#ifdef DEBUG
2133 if (funcdidcallbacks) {
2134 first_iclog = iclog = log->l_iclog;
2135 do {
2136 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2137 /*
2138 * Terminate the loop if iclogs are found in states
2139 * which will cause other threads to clean up iclogs.
2140 *
2141 * SYNCING - i/o completion will go through logs
2142 * DONE_SYNC - interrupt thread should be waiting for
2143 * LOG_LOCK
2144 * IOERROR - give up hope all ye who enter here
2145 */
2146 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2147 iclog->ic_state == XLOG_STATE_SYNCING ||
2148 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2149 iclog->ic_state == XLOG_STATE_IOERROR )
2150 break;
2151 iclog = iclog->ic_next;
2152 } while (first_iclog != iclog);
2153 }
2154#endif
2155
2156 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
2157 flushcnt = log->l_flushcnt;
2158 log->l_flushcnt = 0;
2159 }
2160 LOG_UNLOCK(log, s);
2161 while (flushcnt--)
2162 vsema(&log->l_flushsema);
2163} /* xlog_state_do_callback */
2164
2165
2166/*
2167 * Finish transitioning this iclog to the dirty state.
2168 *
2169 * Make sure that we completely execute this routine only when this is
2170 * the last call to the iclog. There is a good chance that iclog flushes,
2171 * when we reach the end of the physical log, get turned into 2 separate
2172 * calls to bwrite. Hence, one iclog flush could generate two calls to this
2173 * routine. By using the reference count bwritecnt, we guarantee that only
2174 * the second completion goes through.
2175 *
2176 * Callbacks could take time, so they are done outside the scope of the
2177 * global state machine log lock. Assume that the calls to cvsema won't
2178	 * take a long time. At least we know they won't sleep.
2179 */
2180void
2181xlog_state_done_syncing(
2182 xlog_in_core_t *iclog,
2183 int aborted)
2184{
2185 xlog_t *log = iclog->ic_log;
2186 SPLDECL(s);
2187
2188 s = LOG_LOCK(log);
2189
2190 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
2191 iclog->ic_state == XLOG_STATE_IOERROR);
2192 ASSERT(iclog->ic_refcnt == 0);
2193 ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
2194
2195
2196 /*
2197 * If we got an error, either on the first buffer, or in the case of
2198 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
2199 * and none should ever be attempted to be written to disk
2200 * again.
2201 */
2202 if (iclog->ic_state != XLOG_STATE_IOERROR) {
2203 if (--iclog->ic_bwritecnt == 1) {
2204 LOG_UNLOCK(log, s);
2205 return;
2206 }
2207 iclog->ic_state = XLOG_STATE_DONE_SYNC;
2208 }
2209
2210 /*
2211 * Someone could be sleeping prior to writing out the next
2212	 * iclog buffer; we wake them all. One will get to do the
2213	 * I/O, the others get to wait for the result.
2214 */
2215 sv_broadcast(&iclog->ic_writesema);
2216 LOG_UNLOCK(log, s);
2217 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2218} /* xlog_state_done_syncing */
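
The bwritecnt dance above is easy to get wrong, so here is a minimal userspace sketch of the same idea, under simplified assumptions (a bare struct, no locking, no state machine); the names are illustrative stand-ins, not the kernel's:

#include <assert.h>

struct split_write {
	int bwritecnt;	/* 1 for a normal write, 2 when split at the
			 * physical end of the log */
};

/* Called once per buffer I/O completion; returns 1 only for the
 * completion that should drive the callback state machine. */
static int split_write_done(struct split_write *w)
{
	assert(w->bwritecnt == 1 || w->bwritecnt == 2);
	if (--w->bwritecnt > 0)
		return 0;	/* first half of a split write: wait */
	return 1;		/* last (or only) completion proceeds */
}
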
2219
2220
2221/*
2222 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2223 * sleep. The flush semaphore is set to the number of in-core buffers and
2224 * decremented around disk syncing. Therefore, if all buffers are syncing,
2225 * this semaphore will cause new writes to sleep until a sync completes.
2226 * Otherwise, this code just does p() followed by v(). This approximates
2227 * a sleep/wakeup except we can't race.
2228 *
2229 * The in-core logs are used in a circular fashion. They are not used
2230 * out-of-order even when an iclog past the head is free.
2231 *
2232 * return:
2233 * * log_offset where xlog_write() can start writing into the in-core
2234 * log's data space.
2235 * * in-core log pointer to which xlog_write() should write.
2236 * * boolean indicating this is a continued write to an in-core log.
2237 * If this is the last write, then the in-core log's offset field
2238 * needs to be incremented, depending on the amount of data which
2239 * is copied.
2240 */
2241int
2242xlog_state_get_iclog_space(xlog_t *log,
2243 int len,
2244 xlog_in_core_t **iclogp,
2245 xlog_ticket_t *ticket,
2246 int *continued_write,
2247 int *logoffsetp)
2248{
2249 SPLDECL(s);
2250 int log_offset;
2251 xlog_rec_header_t *head;
2252 xlog_in_core_t *iclog;
2253 int error;
2254
2255restart:
2256 s = LOG_LOCK(log);
2257 if (XLOG_FORCED_SHUTDOWN(log)) {
2258 LOG_UNLOCK(log, s);
2259 return XFS_ERROR(EIO);
2260 }
2261
2262 iclog = log->l_iclog;
2263 if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
2264 log->l_flushcnt++;
2265 LOG_UNLOCK(log, s);
2266 xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
2267 XFS_STATS_INC(xs_log_noiclogs);
2268 /* Ensure that log writes happen */
2269 psema(&log->l_flushsema, PINOD);
2270 goto restart;
2271 }
2272 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
2273 head = &iclog->ic_header;
2274
2275 iclog->ic_refcnt++; /* prevents sync */
2276 log_offset = iclog->ic_offset;
2277
2278 /* On the 1st write to an iclog, figure out lsn. This works
2279 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
2280 * committing to. If the offset is set, that's how many blocks
2281 * must be written.
2282 */
2283 if (log_offset == 0) {
2284 ticket->t_curr_res -= log->l_iclog_hsize;
2285 INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
2286 ASSIGN_LSN(head->h_lsn, log);
2287 ASSERT(log->l_curr_block >= 0);
2288 }
2289
2290 /* If there is enough room to write everything, then do it. Otherwise,
2291 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
2292 * bit is on, so this will get flushed out. Don't update ic_offset
2293 * until you know exactly how many bytes get copied. Therefore, wait
2294 * until later to update ic_offset.
2295 *
2296 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
2297 * can fit into remaining data section.
2298 */
2299 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
2300 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2301
2302 /* If I'm the only one writing to this iclog, sync it to disk */
2303 if (iclog->ic_refcnt == 1) {
2304 LOG_UNLOCK(log, s);
2305 if ((error = xlog_state_release_iclog(log, iclog)))
2306 return (error);
2307 } else {
2308 iclog->ic_refcnt--;
2309 LOG_UNLOCK(log, s);
2310 }
2311 goto restart;
2312 }
2313
2314 /* Do we have enough room to write the full amount in the remainder
2315 * of this iclog? Or must we continue a write on the next iclog and
2316 * mark this iclog as completely taken? In the case where we switch
2317 * iclogs (to mark it taken), this particular iclog will release/sync
2318 * to disk in xlog_write().
2319 */
2320 if (len <= iclog->ic_size - iclog->ic_offset) {
2321 *continued_write = 0;
2322 iclog->ic_offset += len;
2323 } else {
2324 *continued_write = 1;
2325 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2326 }
2327 *iclogp = iclog;
2328
2329 ASSERT(iclog->ic_offset <= iclog->ic_size);
2330 LOG_UNLOCK(log, s);
2331
2332 *logoffsetp = log_offset;
2333 return 0;
2334} /* xlog_state_get_iclog_space */
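
The fit-or-continue decision at the end of the function is the crux of the routine; a hedged sketch of just that step, with size/offset as simplified stand-ins for ic_size/ic_offset:

struct iclog_space {
	int size;	/* usable bytes in the iclog */
	int offset;	/* bytes already claimed */
};

/* Returns the continued_write flag the caller would receive. */
static int claim_space(struct iclog_space *ic, int len)
{
	if (len <= ic->size - ic->offset) {
		ic->offset += len;	/* whole record fits here */
		return 0;
	}
	/* doesn't fit: the caller switches this iclog to WANT_SYNC and
	 * continues the write in the next iclog in the ring */
	return 1;
}
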
2335
2336/*
2337 * Atomically get the log space required for a log ticket.
2338 *
2339 * Once a ticket gets put onto the reserveq, it will only return after
2340 * the needed reservation is satisfied.
2341 */
2342STATIC int
2343xlog_grant_log_space(xlog_t *log,
2344 xlog_ticket_t *tic)
2345{
2346 int free_bytes;
2347 int need_bytes;
2348 SPLDECL(s);
2349#ifdef DEBUG
2350 xfs_lsn_t tail_lsn;
2351#endif
2352
2353
2354#ifdef DEBUG
2355 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2356 panic("grant Recovery problem");
2357#endif
2358
2359 /* Is there space or do we need to sleep? */
2360 s = GRANT_LOCK(log);
2361 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter");
2362
2363 /* something is already sleeping; insert new transaction at end */
2364 if (log->l_reserve_headq) {
2365 XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
2366 xlog_trace_loggrant(log, tic,
2367 "xlog_grant_log_space: sleep 1");
2368 /*
2369 * Gotta check this before going to sleep, while we're
2370 * holding the grant lock.
2371 */
2372 if (XLOG_FORCED_SHUTDOWN(log))
2373 goto error_return;
2374
2375 XFS_STATS_INC(xs_sleep_logspace);
2376 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
2377 /*
2378 * If we got an error, and the filesystem is shutting down,
2379 * we'll catch it down below. So just continue...
2380 */
2381 xlog_trace_loggrant(log, tic,
2382 "xlog_grant_log_space: wake 1");
2383 s = GRANT_LOCK(log);
2384 }
2385 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2386 need_bytes = tic->t_unit_res*tic->t_ocnt;
2387 else
2388 need_bytes = tic->t_unit_res;
2389
2390redo:
2391 if (XLOG_FORCED_SHUTDOWN(log))
2392 goto error_return;
2393
2394 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
2395 log->l_grant_reserve_bytes);
2396 if (free_bytes < need_bytes) {
2397 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2398 XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
2399 xlog_trace_loggrant(log, tic,
2400 "xlog_grant_log_space: sleep 2");
2401 XFS_STATS_INC(xs_sleep_logspace);
2402 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
2403
2404 if (XLOG_FORCED_SHUTDOWN(log)) {
2405 s = GRANT_LOCK(log);
2406 goto error_return;
2407 }
2408
2409 xlog_trace_loggrant(log, tic,
2410 "xlog_grant_log_space: wake 2");
2411 xlog_grant_push_ail(log->l_mp, need_bytes);
2412 s = GRANT_LOCK(log);
2413 goto redo;
2414 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2415 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
2416
2417 /* we've got enough space */
2418 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w');
2419 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r');
2420#ifdef DEBUG
2421 tail_lsn = log->l_tail_lsn;
2422 /*
2423	 * Check to make sure the grant write head didn't just overlap the
2424 * tail. If the cycles are the same, we can't be overlapping.
2425 * Otherwise, make sure that the cycles differ by exactly one and
2426 * check the byte count.
2427 */
2428 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2429 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2430 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2431 }
2432#endif
2433 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit");
2434 xlog_verify_grant_head(log, 1);
2435 GRANT_UNLOCK(log, s);
2436 return 0;
2437
2438 error_return:
2439 if (tic->t_flags & XLOG_TIC_IN_Q)
2440 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
2441 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
2442 /*
2443 * If we are failing, make sure the ticket doesn't have any
2444 * current reservations. We don't want to add this back when
2445 * the ticket/transaction gets cancelled.
2446 */
2447 tic->t_curr_res = 0;
2448 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2449 GRANT_UNLOCK(log, s);
2450 return XFS_ERROR(EIO);
2451} /* xlog_grant_log_space */
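
Both grant paths hinge on the free-space computation done by xlog_space_left(). A simplified model of that arithmetic on a circular log, ignoring the sanity checks the real function performs and assuming the head is at most one cycle ahead of the tail:

/* Free bytes between the grant head and the tail of a circular log. */
static int circ_space_left(int logsize, int tail_cycle, int tail_bytes,
			   int head_cycle, int head_bytes)
{
	if (head_cycle == tail_cycle)		/* head ahead, same lap */
		return logsize - (head_bytes - tail_bytes);
	return tail_bytes - head_bytes;		/* head wrapped past end */
}
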
2452
2453
2454/*
2455 * Replenish the byte reservation required by moving the grant write head.
2456 *
2457 *
2458 */
2459STATIC int
2460xlog_regrant_write_log_space(xlog_t *log,
2461 xlog_ticket_t *tic)
2462{
2463 SPLDECL(s);
2464 int free_bytes, need_bytes;
2465 xlog_ticket_t *ntic;
2466#ifdef DEBUG
2467 xfs_lsn_t tail_lsn;
2468#endif
2469
2470 tic->t_curr_res = tic->t_unit_res;
2471
2472 if (tic->t_cnt > 0)
2473 return (0);
2474
2475#ifdef DEBUG
2476 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2477 panic("regrant Recovery problem");
2478#endif
2479
2480 s = GRANT_LOCK(log);
2481 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter");
2482
2483 if (XLOG_FORCED_SHUTDOWN(log))
2484 goto error_return;
2485
2486 /* If there are other waiters on the queue then give them a
2487	 * chance at logspace before us. Wake up the first waiters;
2488	 * if we do not wake up all the waiters, go to sleep waiting
2489	 * for more free space; otherwise try to get some space for
2490 * this transaction.
2491 */
2492
2493 if ((ntic = log->l_write_headq)) {
2494 free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
2495 log->l_grant_write_bytes);
2496 do {
2497 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2498
2499 if (free_bytes < ntic->t_unit_res)
2500 break;
2501 free_bytes -= ntic->t_unit_res;
2502 sv_signal(&ntic->t_sema);
2503 ntic = ntic->t_next;
2504 } while (ntic != log->l_write_headq);
2505
2506 if (ntic != log->l_write_headq) {
2507 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2508 XLOG_INS_TICKETQ(log->l_write_headq, tic);
2509
2510 xlog_trace_loggrant(log, tic,
2511 "xlog_regrant_write_log_space: sleep 1");
2512 XFS_STATS_INC(xs_sleep_logspace);
2513 sv_wait(&tic->t_sema, PINOD|PLTWAIT,
2514 &log->l_grant_lock, s);
2515
2516 /* If we're shutting down, this tic is already
2517 * off the queue */
2518 if (XLOG_FORCED_SHUTDOWN(log)) {
2519 s = GRANT_LOCK(log);
2520 goto error_return;
2521 }
2522
2523 xlog_trace_loggrant(log, tic,
2524 "xlog_regrant_write_log_space: wake 1");
2525 xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
2526 s = GRANT_LOCK(log);
2527 }
2528 }
2529
2530 need_bytes = tic->t_unit_res;
2531
2532redo:
2533 if (XLOG_FORCED_SHUTDOWN(log))
2534 goto error_return;
2535
2536 free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
2537 log->l_grant_write_bytes);
2538 if (free_bytes < need_bytes) {
2539 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2540 XLOG_INS_TICKETQ(log->l_write_headq, tic);
2541 XFS_STATS_INC(xs_sleep_logspace);
2542 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
2543
2544 /* If we're shutting down, this tic is already off the queue */
2545 if (XLOG_FORCED_SHUTDOWN(log)) {
2546 s = GRANT_LOCK(log);
2547 goto error_return;
2548 }
2549
2550 xlog_trace_loggrant(log, tic,
2551 "xlog_regrant_write_log_space: wake 2");
2552 xlog_grant_push_ail(log->l_mp, need_bytes);
2553 s = GRANT_LOCK(log);
2554 goto redo;
2555 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2556 XLOG_DEL_TICKETQ(log->l_write_headq, tic);
2557
2558 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */
2559#ifdef DEBUG
2560 tail_lsn = log->l_tail_lsn;
2561 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2562 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2563 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2564 }
2565#endif
2566
2567 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
2568 xlog_verify_grant_head(log, 1);
2569 GRANT_UNLOCK(log, s);
2570 return (0);
2571
2572
2573 error_return:
2574 if (tic->t_flags & XLOG_TIC_IN_Q)
2575 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
2576 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
2577 /*
2578 * If we are failing, make sure the ticket doesn't have any
2579 * current reservations. We don't want to add this back when
2580 * the ticket/transaction gets cancelled.
2581 */
2582 tic->t_curr_res = 0;
2583 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2584 GRANT_UNLOCK(log, s);
2585 return XFS_ERROR(EIO);
2586} /* xlog_regrant_write_log_space */
2587
2588
2589/* The first cnt-1 times through here we don't need to
2590 * move the grant write head because the permanent
2591 * reservation has reserved cnt times the unit amount.
2592 * Release part of current permanent unit reservation and
2593 * reset current reservation to be one units worth. Also
2594 * move grant reservation head forward.
2595 */
2596STATIC void
2597xlog_regrant_reserve_log_space(xlog_t *log,
2598 xlog_ticket_t *ticket)
2599{
2600 SPLDECL(s);
2601
2602 xlog_trace_loggrant(log, ticket,
2603 "xlog_regrant_reserve_log_space: enter");
2604 if (ticket->t_cnt > 0)
2605 ticket->t_cnt--;
2606
2607 s = GRANT_LOCK(log);
2608 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
2609 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
2610 ticket->t_curr_res = ticket->t_unit_res;
2611 xlog_trace_loggrant(log, ticket,
2612 "xlog_regrant_reserve_log_space: sub current res");
2613 xlog_verify_grant_head(log, 1);
2614
2615 /* just return if we still have some of the pre-reserved space */
2616 if (ticket->t_cnt > 0) {
2617 GRANT_UNLOCK(log, s);
2618 return;
2619 }
2620
2621 XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r');
2622 xlog_trace_loggrant(log, ticket,
2623 "xlog_regrant_reserve_log_space: exit");
2624 xlog_verify_grant_head(log, 0);
2625 GRANT_UNLOCK(log, s);
2626 ticket->t_curr_res = ticket->t_unit_res;
2627} /* xlog_regrant_reserve_log_space */
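
A compact model of the refill rule described above: with a permanent reservation of count N, the first N-1 regrants are already paid for, and only after the count is exhausted does a refill consume new grant space. The field names mirror t_cnt/t_curr_res/t_unit_res, but the function is a hypothetical simplification:

struct perm_resv {
	int cnt;	/* refills still pre-reserved */
	int curr_res;	/* bytes left in the current reservation */
	int unit_res;	/* bytes per refill */
};

/* Returns the bytes of new grant space this refill consumes. */
static int refill_cost(struct perm_resv *t)
{
	if (t->cnt > 0)
		t->cnt--;
	t->curr_res = t->unit_res;
	return t->cnt > 0 ? 0 : t->unit_res;
}
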
2628
2629
2630/*
2631 * Give back the space left from a reservation.
2632 *
2633 * All the information we need to make a correct determination of space left
2634 * is present. For non-permanent reservations, things are quite easy. The
2635 * count should have been decremented to zero. We only need to deal with the
2636 * space remaining in the current reservation part of the ticket. If the
2637 * ticket contains a permanent reservation, there may be left over space which
2638 * needs to be released. A count of N means that N-1 refills of the current
2639 * reservation can be done before we need to ask for more space. The first
2640 * one goes to fill up the first current reservation. Once we run out of
2641 * space, the count will stay at zero and the only space remaining will be
2642 * in the current reservation field.
2643 */
2644STATIC void
2645xlog_ungrant_log_space(xlog_t *log,
2646 xlog_ticket_t *ticket)
2647{
2648 SPLDECL(s);
2649
2650 if (ticket->t_cnt > 0)
2651 ticket->t_cnt--;
2652
2653 s = GRANT_LOCK(log);
2654 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
2655
2656 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
2657 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
2658
2659 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
2660
2661 /* If this is a permanent reservation ticket, we may be able to free
2662 * up more space based on the remaining count.
2663 */
2664 if (ticket->t_cnt > 0) {
2665 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2666 XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w');
2667 XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r');
2668 }
2669
2670 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
2671 xlog_verify_grant_head(log, 1);
2672 GRANT_UNLOCK(log, s);
2673 xfs_log_move_tail(log->l_mp, 1);
2674} /* xlog_ungrant_log_space */
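
The give-back arithmetic above reduces to one line; a sketch with a worked example (hypothetical helper, not part of the source):

/* Bytes returned to the grant heads when a ticket is ungranted. */
static int ungrant_bytes(int curr_res, int unit_res, int cnt)
{
	if (cnt > 0)
		cnt--;			/* the refill being finished */
	return curr_res + cnt * unit_res;
}

/* e.g. a permanent ticket with unit_res=1000, cnt=3, curr_res=400
 * gives back 400 + 2*1000 = 2400 bytes. */
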
2675
2676
2677/*
2678 * Atomically put back used ticket.
2679 */
2680void
2681xlog_state_put_ticket(xlog_t *log,
2682 xlog_ticket_t *tic)
2683{
2684 unsigned long s;
2685
2686 s = LOG_LOCK(log);
2687 xlog_ticket_put(log, tic);
2688 LOG_UNLOCK(log, s);
2689} /* xlog_state_put_ticket */
2690
2691/*
2692 * Flush iclog to disk if this is the last reference to the given iclog and
2693 * the WANT_SYNC bit is set.
2694 *
2695 * When this function is entered, the iclog is not necessarily in the
2696 * WANT_SYNC state. It may be sitting around waiting to get filled.
2697 *
2698 *
2699 */
2700int
2701xlog_state_release_iclog(xlog_t *log,
2702 xlog_in_core_t *iclog)
2703{
2704 SPLDECL(s);
2705 int sync = 0; /* do we sync? */
2706
2707 xlog_assign_tail_lsn(log->l_mp);
2708
2709 s = LOG_LOCK(log);
2710
2711 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2712 LOG_UNLOCK(log, s);
2713 return XFS_ERROR(EIO);
2714 }
2715
2716 ASSERT(iclog->ic_refcnt > 0);
2717 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
2718 iclog->ic_state == XLOG_STATE_WANT_SYNC);
2719
2720 if (--iclog->ic_refcnt == 0 &&
2721 iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2722 sync++;
2723 iclog->ic_state = XLOG_STATE_SYNCING;
2724 INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn);
2725 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
2726 /* cycle incremented when incrementing curr_block */
2727 }
2728
2729 LOG_UNLOCK(log, s);
2730
2731 /*
2732 * We let the log lock go, so it's possible that we hit a log I/O
2733	 * error or some other SHUTDOWN condition that marks the iclog
2734 * as XLOG_STATE_IOERROR before the bwrite. However, we know that
2735 * this iclog has consistent data, so we ignore IOERROR
2736 * flags after this point.
2737 */
2738 if (sync) {
2739 return xlog_sync(log, iclog);
2740 }
2741 return (0);
2742
2743} /* xlog_state_release_iclog */
2744
2745
2746/*
2747 * This routine will mark the current iclog in the ring as WANT_SYNC
2748 * and move the current iclog pointer to the next iclog in the ring.
2749 * When this routine is called from xlog_state_get_iclog_space(), the
2750 * exact size of the iclog has not yet been determined; all we know
2751 * is that we have run out of space in this log record.
2752 */
2753STATIC void
2754xlog_state_switch_iclogs(xlog_t *log,
2755 xlog_in_core_t *iclog,
2756 int eventual_size)
2757{
2758 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
2759 if (!eventual_size)
2760 eventual_size = iclog->ic_offset;
2761 iclog->ic_state = XLOG_STATE_WANT_SYNC;
2762 INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block);
2763 log->l_prev_block = log->l_curr_block;
2764 log->l_prev_cycle = log->l_curr_cycle;
2765
2766 /* roll log?: ic_offset changed later */
2767 log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
2768
2769 /* Round up to next log-sunit */
2770 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
2771 log->l_mp->m_sb.sb_logsunit > 1) {
2772 __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
2773 log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
2774 }
2775
2776 if (log->l_curr_block >= log->l_logBBsize) {
2777 log->l_curr_cycle++;
2778 if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
2779 log->l_curr_cycle++;
2780 log->l_curr_block -= log->l_logBBsize;
2781 ASSERT(log->l_curr_block >= 0);
2782 }
2783 ASSERT(iclog == log->l_iclog);
2784 log->l_iclog = iclog->ic_next;
2785} /* xlog_state_switch_iclogs */
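
The block-advance arithmetic above, restated as a self-contained sketch. 0xFEEDbabe is XLOG_HEADER_MAGIC_NUM from xfs_log_priv.h (an illegal cycle number); everything else is a simplified stand-in:

#define MAGIC_CYCLE 0xFEEDbabeU	/* XLOG_HEADER_MAGIC_NUM */

static void advance_log_head(int *curr_block, unsigned int *curr_cycle,
			     int size_bb, int sunit_bb, int log_bb)
{
	*curr_block += size_bb;
	if (sunit_bb > 1)	/* round up to the log stripe unit */
		*curr_block = ((*curr_block + sunit_bb - 1) / sunit_bb)
				* sunit_bb;
	if (*curr_block >= log_bb) {	/* wrapped the physical log */
		(*curr_cycle)++;
		if (*curr_cycle == MAGIC_CYCLE)
			(*curr_cycle)++;  /* never emit the magic value */
		*curr_block -= log_bb;
	}
}
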
2786
2787
2788/*
2789 * Write out all data in the in-core log as of this exact moment in time.
2790 *
2791 * Data may be written to the in-core log during this call. However,
2792 * we don't guarantee this data will be written out. A change from the
2793 * past implementation means this routine will *not* write out zero-length LRs.
2794 *
2795 * Basically, we try to perform an intelligent scan of the in-core logs.
2796 * If we determine there is no flushable data, we just return. There is no
2797 * flushable data if:
2798 *
2799 * 1. the current iclog is active and has no data; the previous iclog
2800 * is in the active or dirty state.
2801 *	2. the current iclog is dirty, and the previous iclog is in the
2802 * active or dirty state.
2803 *
2804 * We may sleep (call psema) if:
2805 *
2806 *	1. the current iclog is not in the active or dirty state.
2807 *	2. the current iclog is dirty, and the previous iclog is not in the
2808 *		active or dirty state.
2809 * 3. the current iclog is active, and there is another thread writing
2810 * to this particular iclog.
2811 * 4. a) the current iclog is active and has no other writers
2812 * b) when we return from flushing out this iclog, it is still
2813 *		not in the active or dirty state.
2814 */
2815STATIC int
2816xlog_state_sync_all(xlog_t *log, uint flags)
2817{
2818 xlog_in_core_t *iclog;
2819 xfs_lsn_t lsn;
2820 SPLDECL(s);
2821
2822 s = LOG_LOCK(log);
2823
2824 iclog = log->l_iclog;
2825 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2826 LOG_UNLOCK(log, s);
2827 return XFS_ERROR(EIO);
2828 }
2829
2830	/* If the head iclog is neither active nor dirty, we just attach
2831 * ourselves to the head and go to sleep.
2832 */
2833 if (iclog->ic_state == XLOG_STATE_ACTIVE ||
2834 iclog->ic_state == XLOG_STATE_DIRTY) {
2835 /*
2836 * If the head is dirty or (active and empty), then
2837 * we need to look at the previous iclog. If the previous
2838 * iclog is active or dirty we are done. There is nothing
2839 * to sync out. Otherwise, we attach ourselves to the
2840 * previous iclog and go to sleep.
2841 */
2842 if (iclog->ic_state == XLOG_STATE_DIRTY ||
2843 (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
2844 iclog = iclog->ic_prev;
2845 if (iclog->ic_state == XLOG_STATE_ACTIVE ||
2846 iclog->ic_state == XLOG_STATE_DIRTY)
2847 goto no_sleep;
2848 else
2849 goto maybe_sleep;
2850 } else {
2851 if (iclog->ic_refcnt == 0) {
2852 /* We are the only one with access to this
2853 * iclog. Flush it out now. There should
2854 * be a roundoff of zero to show that someone
2855 * has already taken care of the roundoff from
2856 * the previous sync.
2857 */
2858 iclog->ic_refcnt++;
2859 lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
2860 xlog_state_switch_iclogs(log, iclog, 0);
2861 LOG_UNLOCK(log, s);
2862
2863 if (xlog_state_release_iclog(log, iclog))
2864 return XFS_ERROR(EIO);
2865 s = LOG_LOCK(log);
2866 if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
2867 iclog->ic_state != XLOG_STATE_DIRTY)
2868 goto maybe_sleep;
2869 else
2870 goto no_sleep;
2871 } else {
2872 /* Someone else is writing to this iclog.
2873 * Use its call to flush out the data. However,
2874 * the other thread may not force out this LR,
2875 * so we mark it WANT_SYNC.
2876 */
2877 xlog_state_switch_iclogs(log, iclog, 0);
2878 goto maybe_sleep;
2879 }
2880 }
2881 }
2882
2883 /* By the time we come around again, the iclog could've been filled
2884 * which would give it another lsn. If we have a new lsn, just
2885 * return because the relevant data has been flushed.
2886 */
2887maybe_sleep:
2888 if (flags & XFS_LOG_SYNC) {
2889 /*
2890 * We must check if we're shutting down here, before
2891 * we wait, while we're holding the LOG_LOCK.
2892 * Then we check again after waking up, in case our
2893		 * sleep was disturbed by bad news.
2894 */
2895 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2896 LOG_UNLOCK(log, s);
2897 return XFS_ERROR(EIO);
2898 }
2899 XFS_STATS_INC(xs_log_force_sleep);
2900 sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
2901 /*
2902 * No need to grab the log lock here since we're
2903 * only deciding whether or not to return EIO
2904 * and the memory read should be atomic.
2905 */
2906 if (iclog->ic_state & XLOG_STATE_IOERROR)
2907 return XFS_ERROR(EIO);
2908
2909 } else {
2910
2911no_sleep:
2912 LOG_UNLOCK(log, s);
2913 }
2914 return 0;
2915} /* xlog_state_sync_all */
2916
2917
2918/*
2919 * Used by code which implements synchronous log forces.
2920 *
2921 * Find in-core log with lsn.
2922 * If it is in the DIRTY state, just return.
2923 * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
2924 * state and go to sleep or return.
2925 * If it is in any other state, go to sleep or return.
2926 *
2927 * If filesystem activity goes to zero, the iclog will get flushed only by
2928 * bdflush().
2929 */
2930int
2931xlog_state_sync(xlog_t *log,
2932 xfs_lsn_t lsn,
2933 uint flags)
2934{
2935 xlog_in_core_t *iclog;
2936 int already_slept = 0;
2937 SPLDECL(s);
2938
2939
2940try_again:
2941 s = LOG_LOCK(log);
2942 iclog = log->l_iclog;
2943
2944 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2945 LOG_UNLOCK(log, s);
2946 return XFS_ERROR(EIO);
2947 }
2948
2949 do {
2950 if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
2951 iclog = iclog->ic_next;
2952 continue;
2953 }
2954
2955 if (iclog->ic_state == XLOG_STATE_DIRTY) {
2956 LOG_UNLOCK(log, s);
2957 return 0;
2958 }
2959
2960 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
2961 /*
2962 * We sleep here if we haven't already slept (e.g.
2963 * this is the first time we've looked at the correct
2964 * iclog buf) and the buffer before us is going to
2965 * be sync'ed. The reason for this is that if we
2966 * are doing sync transactions here, by waiting for
2967 * the previous I/O to complete, we can allow a few
2968 * more transactions into this iclog before we close
2969 * it down.
2970 *
2971 * Otherwise, we mark the buffer WANT_SYNC, and bump
2972 * up the refcnt so we can release the log (which drops
2973 * the ref count). The state switch keeps new transaction
2974 * commits from using this buffer. When the current commits
2975 * finish writing into the buffer, the refcount will drop to
2976 * zero and the buffer will go out then.
2977 */
2978 if (!already_slept &&
2979 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC |
2980 XLOG_STATE_SYNCING))) {
2981 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
2982 XFS_STATS_INC(xs_log_force_sleep);
2983 sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
2984 &log->l_icloglock, s);
2985 already_slept = 1;
2986 goto try_again;
2987 } else {
2988 iclog->ic_refcnt++;
2989 xlog_state_switch_iclogs(log, iclog, 0);
2990 LOG_UNLOCK(log, s);
2991 if (xlog_state_release_iclog(log, iclog))
2992 return XFS_ERROR(EIO);
2993 s = LOG_LOCK(log);
2994 }
2995 }
2996
2997 if ((flags & XFS_LOG_SYNC) && /* sleep */
2998 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
2999
3000 /*
3001 * Don't wait on the forcesema if we know that we've
3002 * gotten a log write error.
3003 */
3004 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3005 LOG_UNLOCK(log, s);
3006 return XFS_ERROR(EIO);
3007 }
3008 XFS_STATS_INC(xs_log_force_sleep);
3009 sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
3010 /*
3011 * No need to grab the log lock here since we're
3012 * only deciding whether or not to return EIO
3013 * and the memory read should be atomic.
3014 */
3015 if (iclog->ic_state & XLOG_STATE_IOERROR)
3016 return XFS_ERROR(EIO);
3017 } else { /* just return */
3018 LOG_UNLOCK(log, s);
3019 }
3020 return 0;
3021
3022 } while (iclog != log->l_iclog);
3023
3024 LOG_UNLOCK(log, s);
3025 return (0);
3026} /* xlog_state_sync */
3027
3028
3029/*
3030 * Called when we want to mark the current iclog as being ready to sync to
3031 * disk.
3032 */
3033void
3034xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3035{
3036 SPLDECL(s);
3037
3038 s = LOG_LOCK(log);
3039
3040 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3041 xlog_state_switch_iclogs(log, iclog, 0);
3042 } else {
3043 ASSERT(iclog->ic_state &
3044 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3045 }
3046
3047 LOG_UNLOCK(log, s);
3048} /* xlog_state_want_sync */
3049
3050
3051
3052/*****************************************************************************
3053 *
3054 * TICKET functions
3055 *
3056 *****************************************************************************
3057 */
3058
3059/*
3060 * Algorithm doesn't take into account page size. ;-(
3061 */
3062STATIC void
3063xlog_state_ticket_alloc(xlog_t *log)
3064{
3065 xlog_ticket_t *t_list;
3066 xlog_ticket_t *next;
3067 xfs_caddr_t buf;
3068 uint i = (NBPP / sizeof(xlog_ticket_t)) - 2;
3069 SPLDECL(s);
3070
3071 /*
3072 * The kmem_zalloc may sleep, so we shouldn't be holding the
3073 * global lock. XXXmiken: may want to use zone allocator.
3074 */
3075 buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP);
3076
3077 s = LOG_LOCK(log);
3078
3079 /* Attach 1st ticket to Q, so we can keep track of allocated memory */
3080 t_list = (xlog_ticket_t *)buf;
3081 t_list->t_next = log->l_unmount_free;
3082 log->l_unmount_free = t_list++;
3083 log->l_ticket_cnt++;
3084 log->l_ticket_tcnt++;
3085
3086 /* Next ticket becomes first ticket attached to ticket free list */
3087 if (log->l_freelist != NULL) {
3088 ASSERT(log->l_tail != NULL);
3089 log->l_tail->t_next = t_list;
3090 } else {
3091 log->l_freelist = t_list;
3092 }
3093 log->l_ticket_cnt++;
3094 log->l_ticket_tcnt++;
3095
3096 /* Cycle through rest of alloc'ed memory, building up free Q */
3097 for ( ; i > 0; i--) {
3098 next = t_list + 1;
3099 t_list->t_next = next;
3100 t_list = next;
3101 log->l_ticket_cnt++;
3102 log->l_ticket_tcnt++;
3103 }
3104 t_list->t_next = NULL;
3105 log->l_tail = t_list;
3106 LOG_UNLOCK(log, s);
3107} /* xlog_state_ticket_alloc */
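
What the function does to its page of memory is the classic carve-a-buffer-into-a-free-list pattern; a sketch under simplified assumptions (one generic node type, count >= 1, and none of the accounting for the first reserved ticket):

#include <stddef.h>

struct freenode { struct freenode *next; };

/* Thread 'count' fixed-size nodes carved from 'buf' into a list. */
static struct freenode *carve_freelist(void *buf, size_t nodesize,
				       size_t count)
{
	struct freenode *head = buf, *n = head;
	size_t i;

	for (i = 1; i < count; i++) {
		n->next = (struct freenode *)((char *)buf + i * nodesize);
		n = n->next;
	}
	n->next = NULL;
	return head;
}
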
3108
3109
3110/*
3111 * Put ticket into free list
3112 *
3113 * Assumption: log lock is held around this call.
3114 */
3115STATIC void
3116xlog_ticket_put(xlog_t *log,
3117 xlog_ticket_t *ticket)
3118{
3119 sv_destroy(&ticket->t_sema);
3120
3121 /*
3122 * Don't think caching will make that much difference. It's
3123	 * more important to make debugging easier.
3124 */
3125#if 0
3126 /* real code will want to use LIFO for caching */
3127 ticket->t_next = log->l_freelist;
3128 log->l_freelist = ticket;
3129 /* no need to clear fields */
3130#else
3131 /* When we debug, it is easier if tickets are cycled */
3132 ticket->t_next = NULL;
3133 if (log->l_tail != 0) {
3134 log->l_tail->t_next = ticket;
3135 } else {
3136 ASSERT(log->l_freelist == 0);
3137 log->l_freelist = ticket;
3138 }
3139 log->l_tail = ticket;
3140#endif
3141 log->l_ticket_cnt++;
3142} /* xlog_ticket_put */
3143
3144
3145/*
3146 * Grab a ticket off the freelist or allocate some more
3147 */
3148xlog_ticket_t *
3149xlog_ticket_get(xlog_t *log,
3150 int unit_bytes,
3151 int cnt,
3152 char client,
3153 uint xflags)
3154{
3155 xlog_ticket_t *tic;
3156 uint num_headers;
3157 SPLDECL(s);
3158
3159 alloc:
3160 if (log->l_freelist == NULL)
3161 xlog_state_ticket_alloc(log); /* potentially sleep */
3162
3163 s = LOG_LOCK(log);
3164 if (log->l_freelist == NULL) {
3165 LOG_UNLOCK(log, s);
3166 goto alloc;
3167 }
3168 tic = log->l_freelist;
3169 log->l_freelist = tic->t_next;
3170 if (log->l_freelist == NULL)
3171 log->l_tail = NULL;
3172 log->l_ticket_cnt--;
3173 LOG_UNLOCK(log, s);
3174
3175 /*
3176 * Permanent reservations have up to 'cnt'-1 active log operations
3177 * in the log. A unit in this case is the amount of space for one
3178 * of these log operations. Normal reservations have a cnt of 1
3179 * and their unit amount is the total amount of space required.
3180 *
3181 * The following lines of code account for non-transaction data
3182 * which occupy space in the on-disk log.
3183 */
3184
3185 /* for start-rec */
3186 unit_bytes += sizeof(xlog_op_header_t);
3187
3188 /* for padding */
3189 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
3190 log->l_mp->m_sb.sb_logsunit > 1) {
3191 /* log su roundoff */
3192 unit_bytes += log->l_mp->m_sb.sb_logsunit;
3193 } else {
3194 /* BB roundoff */
3195 unit_bytes += BBSIZE;
3196 }
3197
3198 /* for commit-rec */
3199 unit_bytes += sizeof(xlog_op_header_t);
3200
3201 /* for LR headers */
3202 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
3203 unit_bytes += log->l_iclog_hsize * num_headers;
3204
3205 tic->t_unit_res = unit_bytes;
3206 tic->t_curr_res = unit_bytes;
3207 tic->t_cnt = cnt;
3208 tic->t_ocnt = cnt;
3209 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff);
3210 tic->t_clientid = client;
3211 tic->t_flags = XLOG_TIC_INITED;
3212 if (xflags & XFS_LOG_PERM_RESERV)
3213 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3214 sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
3215
3216 return tic;
3217} /* xlog_ticket_get */
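
The reservation padding computed above, collected into one hypothetical helper so the pieces are visible side by side (the parameter values are whatever the caller's log geometry dictates; nothing here is the real signature):

/* Total reservation for 'payload' bytes of transaction data. */
static int ticket_unit_bytes(int payload, int op_hdr_size, int roundoff,
			     int iclog_size, int iclog_hsize)
{
	int bytes = payload;

	bytes += op_hdr_size;		/* start record */
	bytes += roundoff;		/* BB or stripe-unit padding */
	bytes += op_hdr_size;		/* commit record */
	/* one log record header per iclog the write may span */
	bytes += iclog_hsize * ((bytes + iclog_size - 1) / iclog_size);
	return bytes;
}
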
3218
3219
3220/******************************************************************************
3221 *
3222 * Log debug routines
3223 *
3224 ******************************************************************************
3225 */
3226#if defined(DEBUG) && !defined(XLOG_NOLOG)
3227/*
3228 * Make sure that the destination ptr is within the valid data region of
3229 * one of the iclogs. This uses backup pointers stored in a different
3230 * part of the log in case we trash the log structure.
3231 */
3232void
3233xlog_verify_dest_ptr(xlog_t *log,
3234 __psint_t ptr)
3235{
3236 int i;
3237 int good_ptr = 0;
3238
3239 for (i=0; i < log->l_iclog_bufs; i++) {
3240 if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
3241 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
3242 good_ptr++;
3243 }
3244 if (! good_ptr)
3245 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3246} /* xlog_verify_dest_ptr */
3247
3248STATIC void
3249xlog_verify_grant_head(xlog_t *log, int equals)
3250{
3251 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
3252 if (equals)
3253 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
3254 else
3255 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
3256 } else {
3257 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
3258 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
3259 }
3260} /* xlog_verify_grant_head */
3261
3262/* check if it will fit */
3263STATIC void
3264xlog_verify_tail_lsn(xlog_t *log,
3265 xlog_in_core_t *iclog,
3266 xfs_lsn_t tail_lsn)
3267{
3268 int blocks;
3269
3270 if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
3271 blocks =
3272 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3273 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3274 xlog_panic("xlog_verify_tail_lsn: ran out of log space");
3275 } else {
3276 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3277
3278 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3279 xlog_panic("xlog_verify_tail_lsn: tail wrapped");
3280
3281 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3282 if (blocks < BTOBB(iclog->ic_offset) + 1)
3283 xlog_panic("xlog_verify_tail_lsn: ran out of log space");
3284 }
3285} /* xlog_verify_tail_lsn */
3286
3287/*
3288 * Perform a number of checks on the iclog before writing to disk.
3289 *
3290 * 1. Make sure the iclogs are still circular
3291 * 2. Make sure we have a good magic number
3292 * 3. Make sure we don't have magic numbers in the data
3293 * 4. Check fields of each log operation header for:
3294 * A. Valid client identifier
3295 * B. tid ptr value falls in valid ptr space (user space code)
3296 * C. Length in log record header is correct according to the
3297 * individual operation headers within record.
3298 * 5. When a bwrite will occur within 5 blocks of the front of the physical
3299 * log, check the preceding blocks of the physical log to make sure all
3300 * the cycle numbers agree with the current cycle number.
3301 */
3302STATIC void
3303xlog_verify_iclog(xlog_t *log,
3304 xlog_in_core_t *iclog,
3305 int count,
3306 boolean_t syncing)
3307{
3308 xlog_op_header_t *ophead;
3309 xlog_in_core_t *icptr;
3310 xlog_in_core_2_t *xhdr;
3311 xfs_caddr_t ptr;
3312 xfs_caddr_t base_ptr;
3313 __psint_t field_offset;
3314 __uint8_t clientid;
3315 int len, i, j, k, op_len;
3316 int idx;
3317 SPLDECL(s);
3318
3319 /* check validity of iclog pointers */
3320 s = LOG_LOCK(log);
3321 icptr = log->l_iclog;
3322 for (i=0; i < log->l_iclog_bufs; i++) {
3323 if (icptr == 0)
3324 xlog_panic("xlog_verify_iclog: invalid ptr");
3325 icptr = icptr->ic_next;
3326 }
3327 if (icptr != log->l_iclog)
3328 xlog_panic("xlog_verify_iclog: corrupt iclog ring");
3329 LOG_UNLOCK(log, s);
3330
3331 /* check log magic numbers */
3332 ptr = (xfs_caddr_t) &(iclog->ic_header);
3333 if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM)
3334 xlog_panic("xlog_verify_iclog: invalid magic num");
3335
3336 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count;
3337 ptr += BBSIZE) {
3338 if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
3339 xlog_panic("xlog_verify_iclog: unexpected magic num");
3340 }
3341
3342 /* check fields */
3343 len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT);
3344 ptr = iclog->ic_datap;
3345 base_ptr = ptr;
3346 ophead = (xlog_op_header_t *)ptr;
3347 xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
3348 for (i = 0; i < len; i++) {
3349 ophead = (xlog_op_header_t *)ptr;
3350
3351 /* clientid is only 1 byte */
3352 field_offset = (__psint_t)
3353 ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
3354 if (syncing == B_FALSE || (field_offset & 0x1ff)) {
3355 clientid = ophead->oh_clientid;
3356 } else {
3357 idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
3358 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3359 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3360 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3361 clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
3362 } else {
3363 clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
3364 }
3365 }
3366 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3367 cmn_err(CE_WARN, "xlog_verify_iclog: invalid clientid %d op 0x%p offset 0x%x", clientid, ophead, field_offset);
3368
3369 /* check length */
3370 field_offset = (__psint_t)
3371 ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
3372 if (syncing == B_FALSE || (field_offset & 0x1ff)) {
3373 op_len = INT_GET(ophead->oh_len, ARCH_CONVERT);
3374 } else {
3375 idx = BTOBBT((__psint_t)&ophead->oh_len -
3376 (__psint_t)iclog->ic_datap);
3377 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3378 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3379 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3380 op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
3381 } else {
3382 op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
3383 }
3384 }
3385 ptr += sizeof(xlog_op_header_t) + op_len;
3386 }
3387} /* xlog_verify_iclog */
3388#endif /* DEBUG && !XLOG_NOLOG */
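
For reference, the op-header walk that xlog_verify_iclog() performs is the standard hop-by-length traversal of variable-sized records; a sketch with simplified, native-endian types (the real code must also handle fields that straddle the packed 512-byte block boundaries):

struct op_hdr {
	unsigned int len;	/* payload bytes following this header */
	unsigned char clientid;
};

/* Walk num_logops records and return the total bytes consumed. */
static long walk_ops(char *data, int num_logops)
{
	char *p = data;
	int i;

	for (i = 0; i < num_logops; i++) {
		struct op_hdr *oh = (struct op_hdr *)p;
		p += sizeof(*oh) + oh->len;
	}
	return (long)(p - data);
}
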
3389
3390/*
3391 * Mark all iclogs IOERROR. LOG_LOCK is held by the caller.
3392 */
3393STATIC int
3394xlog_state_ioerror(
3395 xlog_t *log)
3396{
3397 xlog_in_core_t *iclog, *ic;
3398
3399 iclog = log->l_iclog;
3400 if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
3401 /*
3402 * Mark all the incore logs IOERROR.
3403 * From now on, no log flushes will result.
3404 */
3405 ic = iclog;
3406 do {
3407 ic->ic_state = XLOG_STATE_IOERROR;
3408 ic = ic->ic_next;
3409 } while (ic != iclog);
3410 return (0);
3411 }
3412 /*
3413 * Return non-zero, if state transition has already happened.
3414 */
3415 return (1);
3416}
3417
3418/*
3419 * This is called from xfs_force_shutdown, when we're forcibly
3420 * shutting down the filesystem, typically because of an IO error.
3421 * Our main objectives here are to make sure that:
3422 * a. the filesystem gets marked 'SHUTDOWN' for all interested
3423 * parties to find out, 'atomically'.
3424 * b. those who're sleeping on log reservations, pinned objects and
3425 *	   other resources get woken up and told the bad news.
3426 * c. nothing new gets queued up after (a) and (b) are done.
3427 * d. if !logerror, flush the iclogs to disk, then seal them off
3428 * for business.
3429 */
3430int
3431xfs_log_force_umount(
3432 struct xfs_mount *mp,
3433 int logerror)
3434{
3435 xlog_ticket_t *tic;
3436 xlog_t *log;
3437 int retval;
3438 SPLDECL(s);
3439 SPLDECL(s2);
3440
3441 log = mp->m_log;
3442
3443 /*
3444 * If this happens during log recovery, don't worry about
3445 * locking; the log isn't open for business yet.
3446 */
3447 if (!log ||
3448 log->l_flags & XLOG_ACTIVE_RECOVERY) {
3449 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3450 XFS_BUF_DONE(mp->m_sb_bp);
3451 return (0);
3452 }
3453
3454 /*
3455 * Somebody could've already done the hard work for us.
3456 * No need to get locks for this.
3457 */
3458 if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
3459 ASSERT(XLOG_FORCED_SHUTDOWN(log));
3460 return (1);
3461 }
3462 retval = 0;
3463 /*
3464 * We must hold both the GRANT lock and the LOG lock,
3465 * before we mark the filesystem SHUTDOWN and wake
3466 * everybody up to tell the bad news.
3467 */
3468 s = GRANT_LOCK(log);
3469 s2 = LOG_LOCK(log);
3470 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3471 XFS_BUF_DONE(mp->m_sb_bp);
3472 /*
3473 * This flag is sort of redundant because of the mount flag, but
3474 * it's good to maintain the separation between the log and the rest
3475 * of XFS.
3476 */
3477 log->l_flags |= XLOG_IO_ERROR;
3478
3479 /*
3480 * If we hit a log error, we want to mark all the iclogs IOERROR
3481 * while we're still holding the loglock.
3482 */
3483 if (logerror)
3484 retval = xlog_state_ioerror(log);
3485 LOG_UNLOCK(log, s2);
3486
3487 /*
3488 * We don't want anybody waiting for log reservations
3489 * after this. That means we have to wake up everybody
3490 * queued up on reserve_headq as well as write_headq.
3491 * In addition, we make sure in xlog_{re}grant_log_space
3492 * that we don't enqueue anything once the SHUTDOWN flag
3493 * is set, and this action is protected by the GRANTLOCK.
3494 */
3495 if ((tic = log->l_reserve_headq)) {
3496 do {
3497 sv_signal(&tic->t_sema);
3498 tic = tic->t_next;
3499 } while (tic != log->l_reserve_headq);
3500 }
3501
3502 if ((tic = log->l_write_headq)) {
3503 do {
3504 sv_signal(&tic->t_sema);
3505 tic = tic->t_next;
3506 } while (tic != log->l_write_headq);
3507 }
3508 GRANT_UNLOCK(log, s);
3509
3510 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3511 ASSERT(!logerror);
3512 /*
3513 * Force the incore logs to disk before shutting the
3514 * log down completely.
3515 */
3516 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC);
3517 s2 = LOG_LOCK(log);
3518 retval = xlog_state_ioerror(log);
3519 LOG_UNLOCK(log, s2);
3520 }
3521 /*
3522 * Wake up everybody waiting on xfs_log_force.
3523 * Callback all log item committed functions as if the
3524 * log writes were completed.
3525 */
3526 xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
3527
3528#ifdef XFSERRORDEBUG
3529 {
3530 xlog_in_core_t *iclog;
3531
3532 s = LOG_LOCK(log);
3533 iclog = log->l_iclog;
3534 do {
3535 ASSERT(iclog->ic_callback == 0);
3536 iclog = iclog->ic_next;
3537 } while (iclog != log->l_iclog);
3538 LOG_UNLOCK(log, s);
3539 }
3540#endif
3541 /* return non-zero if log IOERROR transition had already happened */
3542 return (retval);
3543}
3544
3545int
3546xlog_iclogs_empty(xlog_t *log)
3547{
3548 xlog_in_core_t *iclog;
3549
3550 iclog = log->l_iclog;
3551 do {
3552 /* endianness does not matter here, zero is zero in
3553 * any language.
3554 */
3555 if (iclog->ic_header.h_num_logops)
3556 return(0);
3557 iclog = iclog->ic_next;
3558 } while (iclog != log->l_iclog);
3559 return(1);
3560}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
new file mode 100644
index 000000000000..0db122ddda3f
--- /dev/null
+++ b/fs/xfs/xfs_log.h
@@ -0,0 +1,182 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LOG_H__
33#define __XFS_LOG_H__
34
35/* get lsn fields */
36
37#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
38#define BLOCK_LSN(lsn) ((uint)(lsn))
39/* this is used in a spot where we might otherwise double-endian-flip */
40#define CYCLE_LSN_DISK(lsn) (((uint *)&(lsn))[0])
41
42#ifdef __KERNEL__
43/*
44 * By comparing each component, we don't have to worry about extra
45 * endian issues in treating two 32 bit numbers as one 64 bit number
46 */
47static
48#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96))
49__attribute__((unused)) /* gcc 2.95, 2.96 miscompile this when inlined */
50#else
51__inline__
52#endif
53xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
54{
55 if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
56 return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
57
58 if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
59 return (BLOCK_LSN(lsn1)<BLOCK_LSN(lsn2))? -999 : 999;
60
61 return 0;
62}
63
64#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
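
The comparator returns a negative, zero, or positive value, memcmp()-style, so it composes naturally; one hypothetical use, not part of this header:

static __inline__ xfs_lsn_t older_lsn(xfs_lsn_t a, xfs_lsn_t b)
{
	return XFS_LSN_CMP(a, b) < 0 ? a : b;	/* smaller LSN is older */
}
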
65
66/*
67 * Macros, structures, prototypes for interface to the log manager.
68 */
69
70/*
71 * Flags to xfs_log_mount
72 */
73#define XFS_LOG_RECOVER 0x1
74
75/*
76 * Flags to xfs_log_done()
77 */
78#define XFS_LOG_REL_PERM_RESERV 0x1
79
80
81/*
82 * Flags to xfs_log_reserve()
83 *
84 * XFS_LOG_SLEEP: If space is not available, sleep (default)
85 * XFS_LOG_NOSLEEP: If space is not available, return error
86 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
87 * performed against this type of reservation, the reservation
88 * is not decreased. Long running transactions should use this.
89 */
90#define XFS_LOG_SLEEP 0x0
91#define XFS_LOG_NOSLEEP 0x1
92#define XFS_LOG_PERM_RESERV 0x2
93#define XFS_LOG_RESV_ALL (XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV)
94
95
96/*
97 * Flags to xfs_log_force()
98 *
99 * XFS_LOG_SYNC: Synchronous force in-core log to disk
100 * XFS_LOG_FORCE: Start in-core log write now.
101 * XFS_LOG_URGE: Start write within some window of time.
102 *
103 * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
104 */
105#define XFS_LOG_SYNC 0x1
106#define XFS_LOG_FORCE 0x2
107#define XFS_LOG_URGE 0x4
108
109#endif /* __KERNEL__ */
110
111
112/* Log Clients */
113#define XFS_TRANSACTION 0x69
114#define XFS_VOLUME 0x2
115#define XFS_LOG 0xaa
116
117typedef struct xfs_log_iovec {
118 xfs_caddr_t i_addr; /* beginning address of region */
119 int i_len; /* length in bytes of region */
120} xfs_log_iovec_t;
121
122typedef void* xfs_log_ticket_t;
123
124/*
125 * Structure used to pass callback function and the function's argument
126 * to the log manager.
127 */
128typedef struct xfs_log_callback {
129 struct xfs_log_callback *cb_next;
130 void (*cb_func)(void *, int);
131 void *cb_arg;
132} xfs_log_callback_t;
133
134
135#ifdef __KERNEL__
136/* Log manager interfaces */
137struct xfs_mount;
138xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
139 xfs_log_ticket_t ticket,
140 void **iclog,
141 uint flags);
142int xfs_log_force(struct xfs_mount *mp,
143 xfs_lsn_t lsn,
144 uint flags);
145int xfs_log_mount(struct xfs_mount *mp,
146 struct xfs_buftarg *log_target,
147 xfs_daddr_t start_block,
148 int num_bblocks);
149int xfs_log_mount_finish(struct xfs_mount *mp, int);
150void xfs_log_move_tail(struct xfs_mount *mp,
151 xfs_lsn_t tail_lsn);
152int xfs_log_notify(struct xfs_mount *mp,
153 void *iclog,
154 xfs_log_callback_t *callback_entry);
155int xfs_log_release_iclog(struct xfs_mount *mp,
156 void *iclog_hndl);
157int xfs_log_reserve(struct xfs_mount *mp,
158 int length,
159 int count,
160 xfs_log_ticket_t *ticket,
161 __uint8_t clientid,
162 uint flags);
163int xfs_log_write(struct xfs_mount *mp,
164 xfs_log_iovec_t region[],
165 int nentries,
166 xfs_log_ticket_t ticket,
167 xfs_lsn_t *start_lsn);
168int xfs_log_unmount(struct xfs_mount *mp);
169int xfs_log_unmount_write(struct xfs_mount *mp);
170void xfs_log_unmount_dealloc(struct xfs_mount *mp);
171int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
172int xfs_log_need_covered(struct xfs_mount *mp);
173
174void xlog_iodone(struct xfs_buf *);
175
176#endif
177
178
179extern int xlog_debug; /* set to 1 to enable real log */
180
181
182#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
new file mode 100644
index 000000000000..c31e3ce3be66
--- /dev/null
+++ b/fs/xfs/xfs_log_priv.h
@@ -0,0 +1,561 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LOG_PRIV_H__
33#define __XFS_LOG_PRIV_H__
34
35struct xfs_buf;
36struct ktrace;
37struct log;
38struct xfs_buf_cancel;
39struct xfs_mount;
40
41/*
42 * Macros, structures, prototypes for internal log manager use.
43 */
44
45#define XLOG_MIN_ICLOGS 2
46#define XLOG_MED_ICLOGS 4
47#define XLOG_MAX_ICLOGS 8
48#define XLOG_CALLBACK_SIZE 10
49#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
50#define XLOG_VERSION_1 1
51#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
52#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
53#define XLOG_RECORD_BSIZE (16*1024) /* eventually 32k */
54#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
55#define XLOG_MAX_RECORD_BSIZE (256*1024)
56#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
57#define XLOG_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
58#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
59#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
60#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
61 (log)->l_mp->m_sb.sb_logsunit)
62#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
63
64#define XLOG_HEADER_SIZE 512
65
66#define XLOG_REC_SHIFT(log) \
67 BTOBB(1 << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
68 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
69#define XLOG_TOTAL_REC_SHIFT(log) \
70 BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
71 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
72
73/*
74 * set lsns
75 */
76
77#define ASSIGN_ANY_LSN_HOST(lsn,cycle,block) \
78 { \
79 (lsn) = ((xfs_lsn_t)(cycle)<<32)|(block); \
80 }
81#define ASSIGN_ANY_LSN_DISK(lsn,cycle,block) \
82 { \
83 INT_SET(((uint *)&(lsn))[0], ARCH_CONVERT, (cycle)); \
84 INT_SET(((uint *)&(lsn))[1], ARCH_CONVERT, (block)); \
85 }
86#define ASSIGN_LSN(lsn,log) \
87 ASSIGN_ANY_LSN_DISK(lsn,(log)->l_curr_cycle,(log)->l_curr_block);
88
89#define XLOG_SET(f,b) (((f) & (b)) == (b))
90
91#define GET_CYCLE(ptr, arch) \
92 (INT_GET(*(uint *)(ptr), arch) == XLOG_HEADER_MAGIC_NUM ? \
93 INT_GET(*((uint *)(ptr)+1), arch) : \
94 INT_GET(*(uint *)(ptr), arch) \
95 )
96
97#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
98
99
100#ifdef __KERNEL__
101
102/*
103 * get client id from packed copy.
104 *
105 * this hack is here because the xlog_pack code copies four bytes
106 * of xlog_op_header containing the fields oh_clientid, oh_flags
107 * and oh_res2 into the packed copy.
108 *
109 * later on this four byte chunk is treated as an int and the
110 * client id is pulled out.
111 *
112 * this has endian issues, of course.
113 */
114
115#if __BYTE_ORDER == __LITTLE_ENDIAN
116#define GET_CLIENT_ID(i,arch) \
117 ((i) & 0xff)
118#else
119#define GET_CLIENT_ID(i,arch) \
120 ((i) >> 24)
121#endif
122
123#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_SUB_SPACE)
124void xlog_grant_sub_space(struct log *log, int bytes, int type);
125#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \
126 xlog_grant_sub_space(log,bytes,type)
127#else
128#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \
129 { \
130 if (type == 'w') { \
131 (log)->l_grant_write_bytes -= (bytes); \
132 if ((log)->l_grant_write_bytes < 0) { \
133 (log)->l_grant_write_bytes += (log)->l_logsize; \
134 (log)->l_grant_write_cycle--; \
135 } \
136 } else { \
137 (log)->l_grant_reserve_bytes -= (bytes); \
138 if ((log)->l_grant_reserve_bytes < 0) { \
139 (log)->l_grant_reserve_bytes += (log)->l_logsize;\
140 (log)->l_grant_reserve_cycle--; \
141 } \
142 } \
143 }
144#endif
145#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_ADD_SPACE)
146void xlog_grant_add_space(struct log *log, int bytes, int type);
147#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \
148 xlog_grant_add_space(log,bytes,type)
149#else
150#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \
151 { \
152 if (type == 'w') { \
153 (log)->l_grant_write_bytes += (bytes); \
154 if ((log)->l_grant_write_bytes > (log)->l_logsize) { \
155 (log)->l_grant_write_bytes -= (log)->l_logsize; \
156 (log)->l_grant_write_cycle++; \
157 } \
158 } else { \
159 (log)->l_grant_reserve_bytes += (bytes); \
160 if ((log)->l_grant_reserve_bytes > (log)->l_logsize) { \
161 (log)->l_grant_reserve_bytes -= (log)->l_logsize;\
162 (log)->l_grant_reserve_cycle++; \
163 } \
164 } \
165 }
166#endif
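
Stripped of the write/reserve selection, both macro pairs implement the same wrap-around move of a (cycle, bytes) head; a sketch in that spirit (the real ADD macro tests '>' rather than '>=', and a negative delta models the SUB case):

static void grant_head_move(int *bytes, int *cycle, int delta, int logsize)
{
	*bytes += delta;
	if (*bytes > logsize) {		/* wrapped forward */
		*bytes -= logsize;
		(*cycle)++;
	} else if (*bytes < 0) {	/* wrapped backward */
		*bytes += logsize;
		(*cycle)--;
	}
}
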
167#define XLOG_INS_TICKETQ(q,tic) \
168 { \
169 if (q) { \
170 (tic)->t_next = (q); \
171 (tic)->t_prev = (q)->t_prev; \
172 (q)->t_prev->t_next = (tic); \
173 (q)->t_prev = (tic); \
174 } else { \
175 (tic)->t_prev = (tic)->t_next = (tic); \
176 (q) = (tic); \
177 } \
178 (tic)->t_flags |= XLOG_TIC_IN_Q; \
179 }
180#define XLOG_DEL_TICKETQ(q,tic) \
181 { \
182 if ((tic) == (tic)->t_next) { \
183 (q) = NULL; \
184 } else { \
185 (q) = (tic)->t_next; \
186 (tic)->t_next->t_prev = (tic)->t_prev; \
187 (tic)->t_prev->t_next = (tic)->t_next; \
188 } \
189 (tic)->t_next = (tic)->t_prev = NULL; \
190 (tic)->t_flags &= ~XLOG_TIC_IN_Q; \
191 }
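
The two queue macros above maintain a circular doubly linked list whose head's prev is the tail; the same logic as plain functions (hypothetical node type, XLOG_TIC_IN_Q flag handling omitted):

struct tick { struct tick *next, *prev; };

static void ticketq_insert(struct tick **q, struct tick *t)
{
	if (*q) {			/* append at the tail */
		t->next = *q;
		t->prev = (*q)->prev;
		(*q)->prev->next = t;
		(*q)->prev = t;
	} else {			/* first element: self-linked */
		t->prev = t->next = t;
		*q = t;
	}
}

static void ticketq_delete(struct tick **q, struct tick *t)
{
	if (t == t->next) {
		*q = NULL;		/* last element */
	} else {
		*q = t->next;		/* the macro re-points the head */
		t->next->prev = t->prev;
		t->prev->next = t->next;
	}
	t->next = t->prev = NULL;
}
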
192
193
194#define GRANT_LOCK(log) mutex_spinlock(&(log)->l_grant_lock)
195#define GRANT_UNLOCK(log, s) mutex_spinunlock(&(log)->l_grant_lock, s)
196#define LOG_LOCK(log) mutex_spinlock(&(log)->l_icloglock)
197#define LOG_UNLOCK(log, s) mutex_spinunlock(&(log)->l_icloglock, s)
198
199#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
200#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
201#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
202
203/*
204 * In core log state
205 */
206#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */
207#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
208#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */
209#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
210#define XLOG_STATE_DO_CALLBACK \
211 0x0010 /* Process callback functions */
212#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
213#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
214#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
215#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
216#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
217#endif /* __KERNEL__ */
218
219/*
220 * Flags to log operation header
221 *
222 * The first write of a new transaction will be preceded with a start
223 * record, XLOG_START_TRANS. Once a transaction is committed, a commit
224 * record is written, XLOG_COMMIT_TRANS. If a single region cannot fit into
225 * the remainder of the current active in-core log, it is split up into
226 * multiple regions. Each partial region will be marked with
227 * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
228 *
229 */
230#define XLOG_START_TRANS 0x01 /* Start a new transaction */
231#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
232#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
233#define XLOG_WAS_CONT_TRANS 0x08 /* Trans continued from prev region */
234#define XLOG_END_TRANS 0x10 /* End a continued transaction */
235#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
236#define XLOG_SKIP_TRANS (XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \
237 XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \
238 XLOG_UNMOUNT_TRANS)
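/*
 * Example (illustrative only): a transaction whose single region is
 * split across two in-core logs is written as a sequence of op
 * headers flagged roughly as
 *
 *	XLOG_START_TRANS	(start record)
 *	XLOG_CONTINUE_TRANS	(first partial region)
 *	XLOG_END_TRANS		(final partial region)
 *	XLOG_COMMIT_TRANS	(commit record, once committed)
 */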
239
240#ifdef __KERNEL__
241/*
242 * Flags to log ticket
243 */
244#define XLOG_TIC_INITED 0x1 /* has been initialized */
245#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
246#define XLOG_TIC_IN_Q 0x4 /* ticket is on a reserve/write queue */
247#endif /* __KERNEL__ */
248
249#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
250
251/*
252 * Flags for log structure
253 */
254#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
255#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
256#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
257#define XLOG_IO_ERROR 0x8 /* log hit an I/O error and is being
258 shut down */
259typedef __uint32_t xlog_tid_t;
260
261
262#ifdef __KERNEL__
263/*
264 * Below are states for covering allocation transactions.
265 * By covering, we mean changing the h_tail_lsn in the last on-disk
266 * log write such that no allocation transactions will be re-done during
267 * recovery after a system crash. Recovery starts at the last on-disk
268 * log write.
269 *
270 * These states are used to insert dummy log entries to cover
271 * space allocation transactions which can undo non-transactional changes
272 * after a crash. Writes to a file with space
273 * already allocated do not result in any transactions. Allocations
274 * might include space beyond the EOF. So if we just push the EOF a
275 * little, the last transaction for the file could contain the wrong
276 * size. If there is no file system activity after an allocation
277 * transaction and the system then crashes, the allocation transaction
278 * will get replayed and the file will be truncated. This could
279 * be hours/days/... after the allocation occurred.
280 *
281 * The fix for this is to do two dummy transactions when the
282 * system is idle. We need two dummy transactions because the h_tail_lsn
283 * in the log record header needs to point beyond the last possible
284 * non-dummy transaction. The first dummy changes the h_tail_lsn to
285 * the first transaction before the dummy. The second dummy causes
286 * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
287 *
288 * These dummy transactions get committed when everything
289 * is idle (after there has been some activity).
290 *
291 * There are 5 states used to control this.
292 *
293 * IDLE -- no logging has been done on the file system or
294 * we are done covering previous transactions.
295 * NEED -- logging has occurred and we need a dummy transaction
296 * when the log becomes idle.
297 * DONE -- we were in the NEED state and have committed a dummy
298 * transaction.
299 * NEED2 -- we detected that a dummy transaction has gone to the
300 * on-disk log with no other transactions.
301 * DONE2 -- we committed a dummy transaction when in the NEED2 state.
302 *
303 * There are two places where we switch states:
304 *
305 * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
306 * We commit the dummy transaction and switch to DONE or DONE2,
307 * respectively. In all other states, we don't do anything.
308 *
309 * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
310 *
311 * No matter what state we are in, if this isn't the dummy
312 * transaction going out, the next state is NEED.
313 * So, if we aren't in the DONE or DONE2 states, the next state
314 * is NEED. We can't be finishing a write of the dummy record
315 * unless it was committed and the state switched to DONE or DONE2.
316 *
317 * If we are in the DONE state and this was a write of the
318 * dummy transaction, we move to NEED2.
319 *
320 * If we are in the DONE2 state and this was a write of the
321 * dummy transaction, we move to IDLE.
322 *
323 *
324 * If we write only one dummy transaction, it can end up in the same
325 * log record as a file space allocation. When this happens, the log
326 * recovery code replays the space allocation and a file could be truncated.
327 * This is why we have the NEED2 and DONE2 states before going idle.
328 */
329
330#define XLOG_STATE_COVER_IDLE 0
331#define XLOG_STATE_COVER_NEED 1
332#define XLOG_STATE_COVER_DONE 2
333#define XLOG_STATE_COVER_NEED2 3
334#define XLOG_STATE_COVER_DONE2 4
335
336#define XLOG_COVER_OPS 5
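/*
 * Minimal sketch (illustrative only; this helper is not part of the
 * original header) of the state transitions described above, as seen
 * when a log write finishes. 'dummy' says whether the record that
 * just went to disk was the dummy transaction.
 */
static inline int
xlog_example_next_cover_state(int cur_state, int dummy)
{
	if (dummy && cur_state == XLOG_STATE_COVER_DONE)
		return XLOG_STATE_COVER_NEED2;
	if (dummy && cur_state == XLOG_STATE_COVER_DONE2)
		return XLOG_STATE_COVER_IDLE;
	/* any non-dummy write, from any state, puts us back to NEED */
	return XLOG_STATE_COVER_NEED;
}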
337
338typedef struct xlog_ticket {
339 sv_t t_sema; /* sleep on this semaphore :20 */
340 struct xlog_ticket *t_next; /* : 4 */
341 struct xlog_ticket *t_prev; /* : 4 */
342 xlog_tid_t t_tid; /* transaction identifier : 4 */
343 int t_curr_res; /* current reservation in bytes : 4 */
344 int t_unit_res; /* unit reservation in bytes : 4 */
345 __uint8_t t_ocnt; /* original count : 1 */
346 __uint8_t t_cnt; /* current count : 1 */
347 __uint8_t t_clientid; /* who does this belong to; : 1 */
348 __uint8_t t_flags; /* properties of reservation : 1 */
349} xlog_ticket_t;
350#endif
351
352
353typedef struct xlog_op_header {
354 xlog_tid_t oh_tid; /* transaction id of operation : 4 b */
355 int oh_len; /* bytes in data region : 4 b */
356 __uint8_t oh_clientid; /* who sent me this : 1 b */
357 __uint8_t oh_flags; /* : 1 b */
358 ushort oh_res2; /* 32 bit align : 2 b */
359} xlog_op_header_t;
360
361
362/* valid values for h_fmt */
363#define XLOG_FMT_UNKNOWN 0
364#define XLOG_FMT_LINUX_LE 1
365#define XLOG_FMT_LINUX_BE 2
366#define XLOG_FMT_IRIX_BE 3
367
368/* our fmt */
369#if __BYTE_ORDER == __LITTLE_ENDIAN
370#define XLOG_FMT XLOG_FMT_LINUX_LE
371#else
372#if __BYTE_ORDER == __BIG_ENDIAN
373#define XLOG_FMT XLOG_FMT_LINUX_BE
374#else
375#error unknown byte order
376#endif
377#endif
378
379typedef struct xlog_rec_header {
380 uint h_magicno; /* log record (LR) identifier : 4 */
381 uint h_cycle; /* write cycle of log : 4 */
382 int h_version; /* LR version : 4 */
383 int h_len; /* len in bytes; should be 64-bit aligned: 4 */
384 xfs_lsn_t h_lsn; /* lsn of this LR : 8 */
385 xfs_lsn_t h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
386 uint h_chksum; /* may not be used; non-zero if used : 4 */
387 int h_prev_block; /* block number to previous LR : 4 */
388 int h_num_logops; /* number of log operations in this LR : 4 */
389 uint h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
390 /* new fields */
391 int h_fmt; /* format of log record : 4 */
392 uuid_t h_fs_uuid; /* uuid of FS : 16 */
393 int h_size; /* iclog size : 4 */
394} xlog_rec_header_t;
395
396typedef struct xlog_rec_ext_header {
397 uint xh_cycle; /* write cycle of log : 4 */
398 uint xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
399} xlog_rec_ext_header_t;
400
401#ifdef __KERNEL__
402/*
403 * - A log record header is 512 bytes. There is plenty of room to grow the
404 * xlog_rec_header_t into the reserved space.
405 * - ic_data follows, so a write to disk can start at the beginning of
406 * the iclog.
407 * - ic_forcesema is used to implement synchronous forcing of the iclog to disk.
408 * - ic_next is the pointer to the next iclog in the ring.
409 * - ic_bp is a pointer to the buffer used to write this incore log to disk.
410 * - ic_log is a pointer back to the global log structure.
411 * - ic_callback is a linked list of callback function/argument pairs to be
412 * called after an iclog finishes writing.
413 * - ic_size is the full size of the header plus data.
414 * - ic_offset is the current number of bytes written to in this iclog.
415 * - ic_refcnt is bumped when someone is writing to the log.
416 * - ic_state is the state of the iclog.
417 */
418typedef struct xlog_iclog_fields {
419 sv_t ic_forcesema;
420 sv_t ic_writesema;
421 struct xlog_in_core *ic_next;
422 struct xlog_in_core *ic_prev;
423 struct xfs_buf *ic_bp;
424 struct log *ic_log;
425 xfs_log_callback_t *ic_callback;
426 xfs_log_callback_t **ic_callback_tail;
427#ifdef XFS_LOG_TRACE
428 struct ktrace *ic_trace;
429#endif
430 int ic_size;
431 int ic_offset;
432 int ic_refcnt;
433 int ic_bwritecnt;
434 ushort_t ic_state;
435 char *ic_datap; /* pointer to iclog data */
436} xlog_iclog_fields_t;
437
438typedef union xlog_in_core2 {
439 xlog_rec_header_t hic_header;
440 xlog_rec_ext_header_t hic_xheader;
441 char hic_sector[XLOG_HEADER_SIZE];
442} xlog_in_core_2_t;
443
444typedef struct xlog_in_core {
445 xlog_iclog_fields_t hic_fields;
446 xlog_in_core_2_t *hic_data;
447} xlog_in_core_t;
448
449/*
450 * Defines to save our code from this glop.
451 */
452#define ic_forcesema hic_fields.ic_forcesema
453#define ic_writesema hic_fields.ic_writesema
454#define ic_next hic_fields.ic_next
455#define ic_prev hic_fields.ic_prev
456#define ic_bp hic_fields.ic_bp
457#define ic_log hic_fields.ic_log
458#define ic_callback hic_fields.ic_callback
459#define ic_callback_tail hic_fields.ic_callback_tail
460#define ic_trace hic_fields.ic_trace
461#define ic_size hic_fields.ic_size
462#define ic_offset hic_fields.ic_offset
463#define ic_refcnt hic_fields.ic_refcnt
464#define ic_bwritecnt hic_fields.ic_bwritecnt
465#define ic_state hic_fields.ic_state
466#define ic_datap hic_fields.ic_datap
467#define ic_header hic_data->hic_header
468
469/*
470 * The reservation head lsn is not made up of a cycle number and block number.
471 * Instead, it uses a cycle number and byte number. Logs don't expect to
472 * overflow 31 bits worth of byte offset, so using a byte number will mean
473 * that round-off problems won't occur when releasing partial reservations.
474 */
475typedef struct log {
476 /* The following block of fields are changed while holding icloglock */
477 sema_t l_flushsema; /* iclog flushing semaphore */
478 int l_flushcnt; /* # of procs waiting on this
479 * sema */
480 int l_ticket_cnt; /* free ticket count */
481 int l_ticket_tcnt; /* total ticket count */
482 int l_covered_state;/* state of "covering disk
483 * log entries" */
484 xlog_ticket_t *l_freelist; /* free list of tickets */
485 xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */
486 xlog_ticket_t *l_tail; /* free list of tickets */
487 xlog_in_core_t *l_iclog; /* head log queue */
488 lock_t l_icloglock; /* grab to change iclog state */
489 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
490 * buffers */
491 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
492 struct xfs_mount *l_mp; /* mount point */
493 struct xfs_buf *l_xbuf; /* extra buffer for log
494 * wrapping */
495 struct xfs_buftarg *l_targ; /* buftarg of log */
496 xfs_daddr_t l_logBBstart; /* start block of log */
497 int l_logsize; /* size of log in bytes */
498 int l_logBBsize; /* size of log in BB chunks */
499 int l_curr_cycle; /* Cycle number of log writes */
500 int l_prev_cycle; /* Cycle number before last
501 * block increment */
502 int l_curr_block; /* current logical log block */
503 int l_prev_block; /* previous logical log block */
504 int l_iclog_size; /* size of an iclog in bytes */
505 int l_iclog_size_log; /* log2 of iclog size */
506 int l_iclog_bufs; /* number of iclog buffers */
507
508 /* The following field are used for debugging; need to hold icloglock */
509 char *l_iclog_bak[XLOG_MAX_ICLOGS];
510
511 /* The following block of fields are changed while holding grant_lock */
512 lock_t l_grant_lock;
513 xlog_ticket_t *l_reserve_headq;
514 xlog_ticket_t *l_write_headq;
515 int l_grant_reserve_cycle;
516 int l_grant_reserve_bytes;
517 int l_grant_write_cycle;
518 int l_grant_write_bytes;
519
520 /* The following fields don't need locking */
521#ifdef XFS_LOG_TRACE
522 struct ktrace *l_trace;
523 struct ktrace *l_grant_trace;
524#endif
525 uint l_flags;
526 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
527 struct xfs_buf_cancel **l_buf_cancel_table;
528 int l_iclog_hsize; /* size of iclog header */
529 int l_iclog_heads; /* # of iclog header sectors */
530 uint l_sectbb_log; /* log2 of sector size in BBs */
531 uint l_sectbb_mask; /* sector size (in BBs)
532 * alignment mask */
533} xlog_t;
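/*
 * Minimal sketch (illustrative only; this helper is not part of the
 * original header): because the grant heads are kept as (cycle, byte)
 * pairs rather than (cycle, block) lsns, releasing a partial
 * reservation needs no rounding, and building a comparable lsn is
 * just a byte-to-basic-block conversion.
 */
static inline xfs_lsn_t
xlog_example_reserve_head_lsn(xlog_t *log)
{
	xfs_lsn_t lsn;

	ASSIGN_ANY_LSN_HOST(lsn, log->l_grant_reserve_cycle,
			    BTOBB(log->l_grant_reserve_bytes));
	return lsn;
}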
534
535
536/* common routines */
537extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
538extern int xlog_find_head(xlog_t *log, xfs_daddr_t *head_blk);
539extern int xlog_find_tail(xlog_t *log,
540 xfs_daddr_t *head_blk,
541 xfs_daddr_t *tail_blk,
542 int readonly);
543extern int xlog_recover(xlog_t *log, int readonly);
544extern int xlog_recover_finish(xlog_t *log, int mfsi_flags);
545extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
546extern void xlog_recover_process_iunlinks(xlog_t *log);
547
548extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
549extern void xlog_put_bp(struct xfs_buf *);
550extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
551extern xfs_caddr_t xlog_align(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
552
553/* iclog tracing */
554#define XLOG_TRACE_GRAB_FLUSH 1
555#define XLOG_TRACE_REL_FLUSH 2
556#define XLOG_TRACE_SLEEP_FLUSH 3
557#define XLOG_TRACE_WAKE_FLUSH 4
558
559#endif /* __KERNEL__ */
560
561#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
new file mode 100644
index 000000000000..9824b5bf0ec0
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.c
@@ -0,0 +1,4098 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_ag.h"
39#include "xfs_sb.h"
40#include "xfs_trans.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_error.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_alloc.h"
48#include "xfs_attr_sf.h"
49#include "xfs_dir_sf.h"
50#include "xfs_dir2_sf.h"
51#include "xfs_dinode.h"
52#include "xfs_imap.h"
53#include "xfs_inode_item.h"
54#include "xfs_inode.h"
55#include "xfs_ialloc_btree.h"
56#include "xfs_ialloc.h"
57#include "xfs_log_priv.h"
58#include "xfs_buf_item.h"
59#include "xfs_alloc_btree.h"
60#include "xfs_log_recover.h"
61#include "xfs_extfree_item.h"
62#include "xfs_trans_priv.h"
63#include "xfs_bit.h"
64#include "xfs_quota.h"
65#include "xfs_rw.h"
66
67STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
68STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
69STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
70 xlog_recover_item_t *item);
71#if defined(DEBUG)
72STATIC void xlog_recover_check_summary(xlog_t *);
73STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
74#else
75#define xlog_recover_check_summary(log)
76#define xlog_recover_check_ail(mp, lip, gen)
77#endif
78
79
80/*
81 * Sector aligned buffer routines for buffer create/read/write/access
82 */
83
84#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \
85 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
86 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
87#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
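/*
 * Worked example (illustrative only): with 2048 byte log sectors,
 * l_sectbb_log = 2 and l_sectbb_mask = 3. A request for 5 basic
 * blocks starting at block 6 becomes
 * XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 5) = 8 blocks starting at
 * XLOG_SECTOR_ROUNDDOWN_BLKNO(log, 6) = 4, i.e. whole sectors.
 */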
88
89xfs_buf_t *
90xlog_get_bp(
91 xlog_t *log,
92 int num_bblks)
93{
94 ASSERT(num_bblks > 0);
95
96 if (log->l_sectbb_log) {
97 if (num_bblks > 1)
98 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
99 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
100 }
101 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
102}
103
104void
105xlog_put_bp(
106 xfs_buf_t *bp)
107{
108 xfs_buf_free(bp);
109}
110
111
112/*
113 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
114 */
115int
116xlog_bread(
117 xlog_t *log,
118 xfs_daddr_t blk_no,
119 int nbblks,
120 xfs_buf_t *bp)
121{
122 int error;
123
124 if (log->l_sectbb_log) {
125 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
126 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
127 }
128
129 ASSERT(nbblks > 0);
130 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
131 ASSERT(bp);
132
133 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
134 XFS_BUF_READ(bp);
135 XFS_BUF_BUSY(bp);
136 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
137 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
138
139 xfsbdstrat(log->l_mp, bp);
140 if ((error = xfs_iowait(bp)))
141 xfs_ioerror_alert("xlog_bread", log->l_mp,
142 bp, XFS_BUF_ADDR(bp));
143 return error;
144}
145
146/*
147 * Write out the buffer at the given block for the given number of blocks.
148 * The buffer is kept locked across the write and is returned locked.
149 * This can only be used for synchronous log writes.
150 */
151int
152xlog_bwrite(
153 xlog_t *log,
154 xfs_daddr_t blk_no,
155 int nbblks,
156 xfs_buf_t *bp)
157{
158 int error;
159
160 if (log->l_sectbb_log) {
161 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
162 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
163 }
164
165 ASSERT(nbblks > 0);
166 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
167
168 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
169 XFS_BUF_ZEROFLAGS(bp);
170 XFS_BUF_BUSY(bp);
171 XFS_BUF_HOLD(bp);
172 XFS_BUF_PSEMA(bp, PRIBIO);
173 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
174 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
175
176 if ((error = xfs_bwrite(log->l_mp, bp)))
177 xfs_ioerror_alert("xlog_bwrite", log->l_mp,
178 bp, XFS_BUF_ADDR(bp));
179 return error;
180}
181
182xfs_caddr_t
183xlog_align(
184 xlog_t *log,
185 xfs_daddr_t blk_no,
186 int nbblks,
187 xfs_buf_t *bp)
188{
189 xfs_caddr_t ptr;
190
191 if (!log->l_sectbb_log)
192 return XFS_BUF_PTR(bp);
193
194 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
195 ASSERT(XFS_BUF_SIZE(bp) >=
196 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
197 return ptr;
198}
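/*
 * Typical usage pattern of the buffer helpers above (illustrative
 * sketch only; this function is not part of the original file):
 * allocate, read, align to the requested block, then release.
 */
STATIC int
xlog_example_read_cycle(
	xlog_t		*log,
	xfs_daddr_t	blk_no,
	uint		*cycle)
{
	xfs_buf_t	*bp;
	xfs_caddr_t	offset;
	int		error;

	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;
	if ((error = xlog_bread(log, blk_no, 1, bp))) {
		xlog_put_bp(bp);
		return error;
	}
	offset = xlog_align(log, blk_no, 1, bp);
	*cycle = GET_CYCLE(offset, ARCH_CONVERT);
	xlog_put_bp(bp);
	return 0;
}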
199
200#ifdef DEBUG
201/*
202 * dump debug superblock and log record information
203 */
204STATIC void
205xlog_header_check_dump(
206 xfs_mount_t *mp,
207 xlog_rec_header_t *head)
208{
209 int b;
210
211 printk("%s: SB : uuid = ", __FUNCTION__);
212 for (b = 0; b < 16; b++)
213 printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
214 printk(", fmt = %d\n", XLOG_FMT);
215 printk(" log : uuid = ");
216 for (b = 0; b < 16; b++)
217 printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
218 printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
219}
220#else
221#define xlog_header_check_dump(mp, head)
222#endif
223
224/*
225 * check log record header for recovery
226 */
227STATIC int
228xlog_header_check_recover(
229 xfs_mount_t *mp,
230 xlog_rec_header_t *head)
231{
232 ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
233
234 /*
235 * IRIX doesn't write the h_fmt field and leaves it zeroed
236 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
237 * a dirty log created in IRIX.
238 */
239 if (unlikely(INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT)) {
240 xlog_warn(
241 "XFS: dirty log written in incompatible format - can't recover");
242 xlog_header_check_dump(mp, head);
243 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
244 XFS_ERRLEVEL_HIGH, mp);
245 return XFS_ERROR(EFSCORRUPTED);
246 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
247 xlog_warn(
248 "XFS: dirty log entry has mismatched uuid - can't recover");
249 xlog_header_check_dump(mp, head);
250 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
251 XFS_ERRLEVEL_HIGH, mp);
252 return XFS_ERROR(EFSCORRUPTED);
253 }
254 return 0;
255}
256
257/*
258 * read the head block of the log and check the header
259 */
260STATIC int
261xlog_header_check_mount(
262 xfs_mount_t *mp,
263 xlog_rec_header_t *head)
264{
265 ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
266
267 if (uuid_is_nil(&head->h_fs_uuid)) {
268 /*
269 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
270 * h_fs_uuid is nil, we assume this log was last mounted
271 * by IRIX and continue.
272 */
273 xlog_warn("XFS: nil uuid in log - IRIX style log");
274 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
275 xlog_warn("XFS: log has mismatched uuid - can't recover");
276 xlog_header_check_dump(mp, head);
277 XFS_ERROR_REPORT("xlog_header_check_mount",
278 XFS_ERRLEVEL_HIGH, mp);
279 return XFS_ERROR(EFSCORRUPTED);
280 }
281 return 0;
282}
283
284STATIC void
285xlog_recover_iodone(
286 struct xfs_buf *bp)
287{
288 xfs_mount_t *mp;
289
290 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
291
292 if (XFS_BUF_GETERROR(bp)) {
293 /*
294 * We're not going to bother about retrying
295 * this during recovery. One strike!
296 */
297 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
298 xfs_ioerror_alert("xlog_recover_iodone",
299 mp, bp, XFS_BUF_ADDR(bp));
300 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
301 }
302 XFS_BUF_SET_FSPRIVATE(bp, NULL);
303 XFS_BUF_CLR_IODONE_FUNC(bp);
304 xfs_biodone(bp);
305}
306
307/*
308 * This routine finds (to an approximation) the first block in the physical
309 * log which contains the given cycle. It uses a binary search algorithm.
310 * Note that the algorithm cannot be perfect because the disk will not
311 * necessarily be perfect.
312 */
313int
314xlog_find_cycle_start(
315 xlog_t *log,
316 xfs_buf_t *bp,
317 xfs_daddr_t first_blk,
318 xfs_daddr_t *last_blk,
319 uint cycle)
320{
321 xfs_caddr_t offset;
322 xfs_daddr_t mid_blk;
323 uint mid_cycle;
324 int error;
325
326 mid_blk = BLK_AVG(first_blk, *last_blk);
327 while (mid_blk != first_blk && mid_blk != *last_blk) {
328 if ((error = xlog_bread(log, mid_blk, 1, bp)))
329 return error;
330 offset = xlog_align(log, mid_blk, 1, bp);
331 mid_cycle = GET_CYCLE(offset, ARCH_CONVERT);
332 if (mid_cycle == cycle) {
333 *last_blk = mid_blk;
334 /* last_half_cycle == mid_cycle */
335 } else {
336 first_blk = mid_blk;
337 /* first_half_cycle == mid_cycle */
338 }
339 mid_blk = BLK_AVG(first_blk, *last_blk);
340 }
341 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
342 (mid_blk == *last_blk && mid_blk-1 == first_blk));
343
344 return 0;
345}
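/*
 * Worked example (illustrative only): for a log laid out as
 *
 *	blocks:  0  1  2  3  4  5  6  7
 *	cycles:  2  2  2  2  1  1  1  1
 *
 * a call with first_blk = 0, *last_blk = 7 and cycle = 1 narrows
 * *last_blk to 4, the first block stamped with the requested cycle.
 */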
346
347/*
348 * Check that the range of blocks does not contain the cycle number
349 * given. The scan needs to occur from front to back and the ptr into the
350 * region must be updated since a later routine will need to perform another
351 * test. If the region is completely good, we end up returning the same
352 * last block number.
353 *
354 * Set new_blk to -1 if we encounter no errors. This is an invalid block number
355 * since we don't ever expect logs to get this large.
356 */
357STATIC int
358xlog_find_verify_cycle(
359 xlog_t *log,
360 xfs_daddr_t start_blk,
361 int nbblks,
362 uint stop_on_cycle_no,
363 xfs_daddr_t *new_blk)
364{
365 xfs_daddr_t i, j;
366 uint cycle;
367 xfs_buf_t *bp;
368 xfs_daddr_t bufblks;
369 xfs_caddr_t buf = NULL;
370 int error = 0;
371
372 bufblks = 1 << ffs(nbblks);
373
374 while (!(bp = xlog_get_bp(log, bufblks))) {
375 /* can't get enough memory to do everything in one big buffer */
376 bufblks >>= 1;
377 if (bufblks <= log->l_sectbb_log)
378 return ENOMEM;
379 }
380
381 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
382 int bcount;
383
384 bcount = min(bufblks, (start_blk + nbblks - i));
385
386 if ((error = xlog_bread(log, i, bcount, bp)))
387 goto out;
388
389 buf = xlog_align(log, i, bcount, bp);
390 for (j = 0; j < bcount; j++) {
391 cycle = GET_CYCLE(buf, ARCH_CONVERT);
392 if (cycle == stop_on_cycle_no) {
393 *new_blk = i+j;
394 goto out;
395 }
396
397 buf += BBSIZE;
398 }
399 }
400
401 *new_blk = -1;
402
403out:
404 xlog_put_bp(bp);
405 return error;
406}
407
408/*
409 * Potentially backup over partial log record write.
410 *
411 * In the typical case, last_blk is the number of the block directly after
412 * a good log record. Therefore, we subtract one to get the block number
413 * of the last block in the given buffer. extra_bblks contains the number
414 * of blocks we would have read on a previous read. This happens when the
415 * last log record is split over the end of the physical log.
416 *
417 * extra_bblks is the number of blocks potentially verified on a previous
418 * call to this routine.
419 */
420STATIC int
421xlog_find_verify_log_record(
422 xlog_t *log,
423 xfs_daddr_t start_blk,
424 xfs_daddr_t *last_blk,
425 int extra_bblks)
426{
427 xfs_daddr_t i;
428 xfs_buf_t *bp;
429 xfs_caddr_t offset = NULL;
430 xlog_rec_header_t *head = NULL;
431 int error = 0;
432 int smallmem = 0;
433 int num_blks = *last_blk - start_blk;
434 int xhdrs;
435
436 ASSERT(start_blk != 0 || *last_blk != start_blk);
437
438 if (!(bp = xlog_get_bp(log, num_blks))) {
439 if (!(bp = xlog_get_bp(log, 1)))
440 return ENOMEM;
441 smallmem = 1;
442 } else {
443 if ((error = xlog_bread(log, start_blk, num_blks, bp)))
444 goto out;
445 offset = xlog_align(log, start_blk, num_blks, bp);
446 offset += ((num_blks - 1) << BBSHIFT);
447 }
448
449 for (i = (*last_blk) - 1; i >= 0; i--) {
450 if (i < start_blk) {
451 /* valid log record not found */
452 xlog_warn(
453 "XFS: Log inconsistent (didn't find previous header)");
454 ASSERT(0);
455 error = XFS_ERROR(EIO);
456 goto out;
457 }
458
459 if (smallmem) {
460 if ((error = xlog_bread(log, i, 1, bp)))
461 goto out;
462 offset = xlog_align(log, i, 1, bp);
463 }
464
465 head = (xlog_rec_header_t *)offset;
466
467 if (XLOG_HEADER_MAGIC_NUM ==
468 INT_GET(head->h_magicno, ARCH_CONVERT))
469 break;
470
471 if (!smallmem)
472 offset -= BBSIZE;
473 }
474
475 /*
476 * We hit the beginning of the physical log & still no header. Return
477 * to caller. If caller can handle a return of -1, then this routine
478 * will be called again for the end of the physical log.
479 */
480 if (i == -1) {
481 error = -1;
482 goto out;
483 }
484
485 /*
486 * We have the final block of the good log (the first block
487 * of the log record _before_ the head). So we check the uuid.
488 */
489 if ((error = xlog_header_check_mount(log->l_mp, head)))
490 goto out;
491
492 /*
493 * We may have found a log record header before we expected one.
494 * last_blk will be the 1st block # with a given cycle #. We may end
495 * up reading an entire log record. In this case, we don't want to
496 * reset last_blk. Only when last_blk points in the middle of a log
497 * record do we update last_blk.
498 */
499 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
500 uint h_size = INT_GET(head->h_size, ARCH_CONVERT);
501
502 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
503 if (h_size % XLOG_HEADER_CYCLE_SIZE)
504 xhdrs++;
505 } else {
506 xhdrs = 1;
507 }
508
509 if (*last_blk - i + extra_bblks
510 != BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + xhdrs)
511 *last_blk = i;
512
513out:
514 xlog_put_bp(bp);
515 return error;
516}
517
518/*
519 * Head is defined to be the point of the log where the next log
520 * write could go. This means that incomplete LR writes at the end are
521 * eliminated when calculating the head. We aren't guaranteed that previous
522 * LRs have complete transactions. We only know that a cycle number of
523 * current cycle number -1 won't be present in the log if we start writing
524 * from our current block number.
525 *
526 * last_blk contains the block number of the first block with a given
527 * cycle number.
528 *
529 * Return: zero if normal, non-zero if error.
530 */
531int
532xlog_find_head(
533 xlog_t *log,
534 xfs_daddr_t *return_head_blk)
535{
536 xfs_buf_t *bp;
537 xfs_caddr_t offset;
538 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
539 int num_scan_bblks;
540 uint first_half_cycle, last_half_cycle;
541 uint stop_on_cycle;
542 int error, log_bbnum = log->l_logBBsize;
543
544 /* Is the end of the log device zeroed? */
545 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
546 *return_head_blk = first_blk;
547
548 /* Is the whole lot zeroed? */
549 if (!first_blk) {
550 /* Linux XFS shouldn't generate totally zeroed logs -
551 * mkfs etc write a dummy unmount record to a fresh
552 * log so we can store the uuid in there
553 */
554 xlog_warn("XFS: totally zeroed log");
555 }
556
557 return 0;
558 } else if (error) {
559 xlog_warn("XFS: empty log check failed");
560 return error;
561 }
562
563 first_blk = 0; /* get cycle # of 1st block */
564 bp = xlog_get_bp(log, 1);
565 if (!bp)
566 return ENOMEM;
567 if ((error = xlog_bread(log, 0, 1, bp)))
568 goto bp_err;
569 offset = xlog_align(log, 0, 1, bp);
570 first_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
571
572 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
573 if ((error = xlog_bread(log, last_blk, 1, bp)))
574 goto bp_err;
575 offset = xlog_align(log, last_blk, 1, bp);
576 last_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
577 ASSERT(last_half_cycle != 0);
578
579 /*
580 * If the 1st half cycle number is equal to the last half cycle number,
581 * then the entire log is stamped with the same cycle number. In this
582 * case, head_blk can't be set to zero (which makes sense). The below
583 * math doesn't work out properly with head_blk equal to zero. Instead,
584 * we set it to log_bbnum which is an invalid block number, but this
585 * value makes the math correct. If head_blk doesn't change through
586 * all the tests below, *head_blk is set to zero at the very end rather
587 * than log_bbnum. In a sense, log_bbnum and zero are the same block
588 * in a circular file.
589 */
590 if (first_half_cycle == last_half_cycle) {
591 /*
592 * In this case we believe that the entire log should have
593 * cycle number last_half_cycle. We need to scan backwards
594 * from the end verifying that there are no holes still
595 * containing last_half_cycle - 1. If we find such a hole,
596 * then the start of that hole will be the new head. The
597 * simple case looks like
598 * x | x ... | x - 1 | x
599 * Another case that fits this picture would be
600 * x | x + 1 | x ... | x
601 * In this case the head really is somewhere at the end of the
602 * log, as one of the latest writes at the beginning was
603 * incomplete.
604 * One more case is
605 * x | x + 1 | x ... | x - 1 | x
606 * This is really the combination of the above two cases, and
607 * the head has to end up at the start of the x-1 hole at the
608 * end of the log.
609 *
610 * In the 256k log case, we will read from the beginning to the
611 * end of the log and search for cycle numbers equal to x-1.
612 * We don't worry about the x+1 blocks that we encounter,
613 * because we know that they cannot be the head since the log
614 * started with x.
615 */
616 head_blk = log_bbnum;
617 stop_on_cycle = last_half_cycle - 1;
618 } else {
619 /*
620 * In this case we want to find the first block with cycle
621 * number matching last_half_cycle. We expect the log to be
622 * some variation on
623 * x + 1 ... | x ...
624 * The first block with cycle number x (last_half_cycle) will
625 * be where the new head belongs. First we do a binary search
626 * for the first occurrence of last_half_cycle. The binary
627 * search may not be totally accurate, so then we scan back
628 * from there looking for occurrences of last_half_cycle before
629 * us. If that backwards scan wraps around the beginning of
630 * the log, then we look for occurrences of last_half_cycle - 1
631 * at the end of the log. The cases we're looking for look
632 * like
633 * x + 1 ... | x | x + 1 | x ...
634 * ^ binary search stopped here
635 * or
636 * x + 1 ... | x ... | x - 1 | x
637 * <---------> less than scan distance
638 */
639 stop_on_cycle = last_half_cycle;
640 if ((error = xlog_find_cycle_start(log, bp, first_blk,
641 &head_blk, last_half_cycle)))
642 goto bp_err;
643 }
644
645 /*
646 * Now validate the answer. Scan back some number of maximum possible
647 * blocks and make sure each one has the expected cycle number. The
648 * maximum is determined by the total possible amount of buffering
649 * in the in-core log. The following number can be made tighter if
650 * we actually look at the block size of the filesystem.
651 */
652 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
653 if (head_blk >= num_scan_bblks) {
654 /*
655 * We are guaranteed that the entire check can be performed
656 * in one buffer.
657 */
658 start_blk = head_blk - num_scan_bblks;
659 if ((error = xlog_find_verify_cycle(log,
660 start_blk, num_scan_bblks,
661 stop_on_cycle, &new_blk)))
662 goto bp_err;
663 if (new_blk != -1)
664 head_blk = new_blk;
665 } else { /* need to read 2 parts of log */
666 /*
667 * We are going to scan backwards in the log in two parts.
668 * First we scan the physical end of the log. In this part
669 * of the log, we are looking for blocks with cycle number
670 * last_half_cycle - 1.
671 * If we find one, then we know that the log starts there, as
672 * we've found a hole that didn't get written in going around
673 * the end of the physical log. The simple case for this is
674 * x + 1 ... | x ... | x - 1 | x
675 * <---------> less than scan distance
676 * If all of the blocks at the end of the log have cycle number
677 * last_half_cycle, then we check the blocks at the start of
678 * the log looking for occurrences of last_half_cycle. If we
679 * find one, then our current estimate for the location of the
680 * first occurrence of last_half_cycle is wrong and we move
681 * back to the hole we've found. This case looks like
682 * x + 1 ... | x | x + 1 | x ...
683 * ^ binary search stopped here
684 * Another case we need to handle that only occurs in 256k
685 * logs is
686 * x + 1 ... | x ... | x+1 | x ...
687 * ^ binary search stops here
688 * In a 256k log, the scan at the end of the log will see the
689 * x + 1 blocks. We need to skip past those since that is
690 * certainly not the head of the log. By searching for
691 * last_half_cycle-1 we accomplish that.
692 */
693 start_blk = log_bbnum - num_scan_bblks + head_blk;
694 ASSERT(head_blk <= INT_MAX &&
695 (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
696 if ((error = xlog_find_verify_cycle(log, start_blk,
697 num_scan_bblks - (int)head_blk,
698 (stop_on_cycle - 1), &new_blk)))
699 goto bp_err;
700 if (new_blk != -1) {
701 head_blk = new_blk;
702 goto bad_blk;
703 }
704
705 /*
706 * Scan beginning of log now. The last part of the physical
707 * log is good. This scan needs to verify that it doesn't find
708 * the last_half_cycle.
709 */
710 start_blk = 0;
711 ASSERT(head_blk <= INT_MAX);
712 if ((error = xlog_find_verify_cycle(log,
713 start_blk, (int)head_blk,
714 stop_on_cycle, &new_blk)))
715 goto bp_err;
716 if (new_blk != -1)
717 head_blk = new_blk;
718 }
719
720 bad_blk:
721 /*
722 * Now we need to make sure head_blk is not pointing to a block in
723 * the middle of a log record.
724 */
725 num_scan_bblks = XLOG_REC_SHIFT(log);
726 if (head_blk >= num_scan_bblks) {
727 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
728
729 /* start ptr at last block ptr before head_blk */
730 if ((error = xlog_find_verify_log_record(log, start_blk,
731 &head_blk, 0)) == -1) {
732 error = XFS_ERROR(EIO);
733 goto bp_err;
734 } else if (error)
735 goto bp_err;
736 } else {
737 start_blk = 0;
738 ASSERT(head_blk <= INT_MAX);
739 if ((error = xlog_find_verify_log_record(log, start_blk,
740 &head_blk, 0)) == -1) {
741 /* We hit the beginning of the log during our search */
742 start_blk = log_bbnum - num_scan_bblks + head_blk;
743 new_blk = log_bbnum;
744 ASSERT(start_blk <= INT_MAX &&
745 (xfs_daddr_t) log_bbnum-start_blk >= 0);
746 ASSERT(head_blk <= INT_MAX);
747 if ((error = xlog_find_verify_log_record(log,
748 start_blk, &new_blk,
749 (int)head_blk)) == -1) {
750 error = XFS_ERROR(EIO);
751 goto bp_err;
752 } else if (error)
753 goto bp_err;
754 if (new_blk != log_bbnum)
755 head_blk = new_blk;
756 } else if (error)
757 goto bp_err;
758 }
759
760 xlog_put_bp(bp);
761 if (head_blk == log_bbnum)
762 *return_head_blk = 0;
763 else
764 *return_head_blk = head_blk;
765 /*
766 * When returning here, we have a good block number. Bad block
767 * means that during a previous crash, we didn't have a clean break
768 * from cycle number N to cycle number N-1. In this case, we need
769 * to find the first block with cycle number N-1.
770 */
771 return 0;
772
773 bp_err:
774 xlog_put_bp(bp);
775
776 if (error)
777 xlog_warn("XFS: failed to find log head");
778 return error;
779}
780
781/*
782 * Find the sync block number or the tail of the log.
783 *
784 * This will be the block number of the last record to have its
785 * associated buffers synced to disk. Every log record header has
786 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
787 * to get a sync block number. The only concern is to figure out which
788 * log record header to believe.
789 *
790 * The following algorithm uses the log record header with the largest
791 * lsn. The entire log record does not need to be valid. We only care
792 * that the header is valid.
793 *
794 * We could speed up the search by using the current head_blk buffer,
795 * but it is not available.
796 */
797int
798xlog_find_tail(
799 xlog_t *log,
800 xfs_daddr_t *head_blk,
801 xfs_daddr_t *tail_blk,
802 int readonly)
803{
804 xlog_rec_header_t *rhead;
805 xlog_op_header_t *op_head;
806 xfs_caddr_t offset = NULL;
807 xfs_buf_t *bp;
808 int error, i, found;
809 xfs_daddr_t umount_data_blk;
810 xfs_daddr_t after_umount_blk;
811 xfs_lsn_t tail_lsn;
812 int hblks;
813
814 found = 0;
815
816 /*
817 * Find previous log record
818 */
819 if ((error = xlog_find_head(log, head_blk)))
820 return error;
821
822 bp = xlog_get_bp(log, 1);
823 if (!bp)
824 return ENOMEM;
825 if (*head_blk == 0) { /* special case */
826 if ((error = xlog_bread(log, 0, 1, bp)))
827 goto bread_err;
828 offset = xlog_align(log, 0, 1, bp);
829 if (GET_CYCLE(offset, ARCH_CONVERT) == 0) {
830 *tail_blk = 0;
831 /* leave all other log inited values alone */
832 goto exit;
833 }
834 }
835
836 /*
837 * Search backwards looking for log record header block
838 */
839 ASSERT(*head_blk < INT_MAX);
840 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
841 if ((error = xlog_bread(log, i, 1, bp)))
842 goto bread_err;
843 offset = xlog_align(log, i, 1, bp);
844 if (XLOG_HEADER_MAGIC_NUM ==
845 INT_GET(*(uint *)offset, ARCH_CONVERT)) {
846 found = 1;
847 break;
848 }
849 }
850 /*
851 * If we haven't found the log record header block, start looking
852 * again from the end of the physical log. XXXmiken: There should be
853 * a check here to make sure we didn't search more than N blocks in
854 * the previous code.
855 */
856 if (!found) {
857 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
858 if ((error = xlog_bread(log, i, 1, bp)))
859 goto bread_err;
860 offset = xlog_align(log, i, 1, bp);
861 if (XLOG_HEADER_MAGIC_NUM ==
862 INT_GET(*(uint*)offset, ARCH_CONVERT)) {
863 found = 2;
864 break;
865 }
866 }
867 }
868 if (!found) {
869 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
870 ASSERT(0);
	xlog_put_bp(bp);	/* release the buffer on this error path */
871 return XFS_ERROR(EIO);
872 }
873
874 /* find blk_no of tail of log */
875 rhead = (xlog_rec_header_t *)offset;
876 *tail_blk = BLOCK_LSN(INT_GET(rhead->h_tail_lsn, ARCH_CONVERT));
877
878 /*
879 * Reset log values according to the state of the log when we
880 * crashed. In the case where head_blk == 0, we bump curr_cycle
881 * one because the next write starts a new cycle rather than
882 * continuing the cycle of the last good log record. At this
883 * point we have guaranteed that all partial log records have been
884 * accounted for. Therefore, we know that the last good log record
885 * written was complete and ended exactly on the end boundary
886 * of the physical log.
887 */
888 log->l_prev_block = i;
889 log->l_curr_block = (int)*head_blk;
890 log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
891 if (found == 2)
892 log->l_curr_cycle++;
893 log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
894 log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
895 log->l_grant_reserve_cycle = log->l_curr_cycle;
896 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
897 log->l_grant_write_cycle = log->l_curr_cycle;
898 log->l_grant_write_bytes = BBTOB(log->l_curr_block);
899
900 /*
901 * Look for unmount record. If we find it, then we know there
902 * was a clean unmount. Since 'i' could be the last block in
903 * the physical log, we convert to a log block before comparing
904 * to the head_blk.
905 *
906 * Save the current tail lsn to use to pass to
907 * xlog_clear_stale_blocks() below. We won't want to clear the
908 * unmount record if there is one, so we pass the lsn of the
909 * unmount record rather than the block after it.
910 */
911 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
912 int h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
913 int h_version = INT_GET(rhead->h_version, ARCH_CONVERT);
914
915 if ((h_version & XLOG_VERSION_2) &&
916 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
917 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
918 if (h_size % XLOG_HEADER_CYCLE_SIZE)
919 hblks++;
920 } else {
921 hblks = 1;
922 }
923 } else {
924 hblks = 1;
925 }
926 after_umount_blk = (i + hblks + (int)
927 BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize;
928 tail_lsn = log->l_tail_lsn;
929 if (*head_blk == after_umount_blk &&
930 INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
931 umount_data_blk = (i + hblks) % log->l_logBBsize;
932 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
933 goto bread_err;
934 }
935 offset = xlog_align(log, umount_data_blk, 1, bp);
936 op_head = (xlog_op_header_t *)offset;
937 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
938 /*
939 * Set tail and last sync so that newly written
940 * log records will point recovery to after the
941 * current unmount record.
942 */
943 ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, log->l_curr_cycle,
944 after_umount_blk);
945 ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle,
946 after_umount_blk);
947 *tail_blk = after_umount_blk;
948 }
949 }
950
951 /*
952 * Make sure that there are no blocks in front of the head
953 * with the same cycle number as the head. This can happen
954 * because we allow multiple outstanding log writes concurrently,
955 * and the later writes might make it out before earlier ones.
956 *
957 * We use the lsn from before modifying it so that we'll never
958 * overwrite the unmount record after a clean unmount.
959 *
960 * Do this only if we are going to recover the filesystem
961 *
962 * NOTE: This used to say "if (!readonly)"
963 * However on Linux, we can & do recover a read-only filesystem.
964 * We only skip recovery if NORECOVERY is specified on mount,
965 * in which case we would not be here.
966 *
967 * But... if the -device- itself is readonly, just skip this.
968 * We can't recover this device anyway, so it won't matter.
969 */
970 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
971 error = xlog_clear_stale_blocks(log, tail_lsn);
972 }
973
974bread_err:
975exit:
976 xlog_put_bp(bp);
977
978 if (error)
979 xlog_warn("XFS: failed to locate log tail");
980 return error;
981}
982
983/*
984 * Is the log zeroed at all?
985 *
986 * The last binary search should be changed to perform an X block read
987 * once X becomes small enough. You can then search linearly through
988 * the X blocks. This will cut down on the number of reads we need to do.
989 *
990 * If the log is partially zeroed, this routine will pass back the blkno
991 * of the first block with cycle number 0. It won't have a complete LR
992 * preceding it.
993 *
994 * Return:
995 * 0 => the log is completely written to
996 * -1 => use *blk_no as the first block of the log
997 * >0 => error has occurred
998 */
999int
1000xlog_find_zeroed(
1001 xlog_t *log,
1002 xfs_daddr_t *blk_no)
1003{
1004 xfs_buf_t *bp;
1005 xfs_caddr_t offset;
1006 uint first_cycle, last_cycle;
1007 xfs_daddr_t new_blk, last_blk, start_blk;
1008 xfs_daddr_t num_scan_bblks;
1009 int error, log_bbnum = log->l_logBBsize;
1010
1011 /* check totally zeroed log */
1012 bp = xlog_get_bp(log, 1);
1013 if (!bp)
1014 return ENOMEM;
1015 if ((error = xlog_bread(log, 0, 1, bp)))
1016 goto bp_err;
1017 offset = xlog_align(log, 0, 1, bp);
1018 first_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1019 if (first_cycle == 0) { /* completely zeroed log */
1020 *blk_no = 0;
1021 xlog_put_bp(bp);
1022 return -1;
1023 }
1024
1025 /* check partially zeroed log */
1026 if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1027 goto bp_err;
1028 offset = xlog_align(log, log_bbnum-1, 1, bp);
1029 last_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1030 if (last_cycle != 0) { /* log completely written to */
1031 xlog_put_bp(bp);
1032 return 0;
1033 } else if (first_cycle != 1) {
1034 /*
1035 * If the cycle of the last block is zero, the cycle of
1036 * the first block must be 1. If it's not, maybe we're
1037 * not looking at a log... Bail out.
1038 */
1039 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
	xlog_put_bp(bp);	/* release the buffer on this error path */
1040 return XFS_ERROR(EINVAL);
1041 }
1042
1043 /* we have a partially zeroed log */
1044 last_blk = log_bbnum-1;
1045 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1046 goto bp_err;
1047
1048 /*
1049 * Validate the answer. Because there is no way to guarantee that
1050 * the entire log is made up of log records which are the same size,
1051 * we scan over the defined maximum blocks. At this point, the maximum
1052 * is not chosen to mean anything special. XXXmiken
1053 */
1054 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1055 ASSERT(num_scan_bblks <= INT_MAX);
1056
1057 if (last_blk < num_scan_bblks)
1058 num_scan_bblks = last_blk;
1059 start_blk = last_blk - num_scan_bblks;
1060
1061 /*
1062 * We search for any instances of cycle number 0 that occur before
1063 * our current estimate of the head. What we're trying to detect is
1064 * 1 ... | 0 | 1 | 0...
1065 * ^ binary search ends here
1066 */
1067 if ((error = xlog_find_verify_cycle(log, start_blk,
1068 (int)num_scan_bblks, 0, &new_blk)))
1069 goto bp_err;
1070 if (new_blk != -1)
1071 last_blk = new_blk;
1072
1073 /*
1074 * Potentially backup over partial log record write. We don't need
1075 * to search the end of the log because we know it is zero.
1076 */
1077 if ((error = xlog_find_verify_log_record(log, start_blk,
1078 &last_blk, 0)) == -1) {
1079 error = XFS_ERROR(EIO);
1080 goto bp_err;
1081 } else if (error)
1082 goto bp_err;
1083
1084 *blk_no = last_blk;
1085bp_err:
1086 xlog_put_bp(bp);
1087 if (error)
1088 return error;
1089 return -1;
1090}
1091
1092/*
1093 * These are simple subroutines used by xlog_clear_stale_blocks() below
1094 * to initialize a buffer full of empty log record headers and write
1095 * them into the log.
1096 */
1097STATIC void
1098xlog_add_record(
1099 xlog_t *log,
1100 xfs_caddr_t buf,
1101 int cycle,
1102 int block,
1103 int tail_cycle,
1104 int tail_block)
1105{
1106 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1107
1108 memset(buf, 0, BBSIZE);
1109 INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
1110 INT_SET(recp->h_cycle, ARCH_CONVERT, cycle);
1111 INT_SET(recp->h_version, ARCH_CONVERT,
1112 XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
1113 ASSIGN_ANY_LSN_DISK(recp->h_lsn, cycle, block);
1114 ASSIGN_ANY_LSN_DISK(recp->h_tail_lsn, tail_cycle, tail_block);
1115 INT_SET(recp->h_fmt, ARCH_CONVERT, XLOG_FMT);
1116 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1117}
1118
1119STATIC int
1120xlog_write_log_records(
1121 xlog_t *log,
1122 int cycle,
1123 int start_block,
1124 int blocks,
1125 int tail_cycle,
1126 int tail_block)
1127{
1128 xfs_caddr_t offset;
1129 xfs_buf_t *bp;
1130 int balign, ealign;
1131 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1132 int end_block = start_block + blocks;
1133 int bufblks;
1134 int error = 0;
1135 int i, j = 0;
1136
1137 bufblks = 1 << ffs(blocks);
1138 while (!(bp = xlog_get_bp(log, bufblks))) {
1139 bufblks >>= 1;
1140 if (bufblks <= log->l_sectbb_log)
1141 return ENOMEM;
1142 }
1143
1144 /* We may need to do a read at the start to fill in part of
1145 * the buffer in the starting sector not covered by the first
1146 * write below.
1147 */
1148 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1149 if (balign != start_block) {
1150 if ((error = xlog_bread(log, start_block, 1, bp))) {
1151 xlog_put_bp(bp);
1152 return error;
1153 }
1154 j = start_block - balign;
1155 }
1156
1157 for (i = start_block; i < end_block; i += bufblks) {
1158 int bcount, endcount;
1159
1160 bcount = min(bufblks, end_block - start_block);
1161 endcount = bcount - j;
1162
1163 /* We may need to do a read at the end to fill in part of
1164 * the buffer in the final sector not covered by the write.
1165 * If this is the same sector as the above read, skip it.
1166 */
1167 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1168 if (j == 0 && (start_block + endcount > ealign)) {
1169 offset = XFS_BUF_PTR(bp);
1170 balign = BBTOB(ealign - start_block);
1171 XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
1172 if ((error = xlog_bread(log, ealign, sectbb, bp)))
1173 break;
1174 XFS_BUF_SET_PTR(bp, offset, bufblks);
1175 }
1176
1177 offset = xlog_align(log, start_block, endcount, bp);
1178 for (; j < endcount; j++) {
1179 xlog_add_record(log, offset, cycle, i+j,
1180 tail_cycle, tail_block);
1181 offset += BBSIZE;
1182 }
1183 error = xlog_bwrite(log, start_block, endcount, bp);
1184 if (error)
1185 break;
1186 start_block += endcount;
1187 j = 0;
1188 }
1189 xlog_put_bp(bp);
1190 return error;
1191}
1192
1193/*
1194 * This routine is called to blow away any incomplete log writes out
1195 * in front of the log head. We do this so that we won't become confused
1196 * if we come up, write only a little bit more, and then crash again.
1197 * If we leave the partial log records out there, this situation could
1198 * cause us to think those partial writes are valid blocks since they
1199 * have the current cycle number. We get rid of them by overwriting them
1200 * with empty log records with the old cycle number rather than the
1201 * current one.
1202 *
1203 * The tail lsn is passed in rather than taken from
1204 * the log so that we will not write over the unmount record after a
1205 * clean unmount in a 512 block log. Doing so would leave the log without
1206 * any valid log records in it until a new one was written. If we crashed
1207 * during that time we would not be able to recover.
1208 */
1209STATIC int
1210xlog_clear_stale_blocks(
1211 xlog_t *log,
1212 xfs_lsn_t tail_lsn)
1213{
1214 int tail_cycle, head_cycle;
1215 int tail_block, head_block;
1216 int tail_distance, max_distance;
1217 int distance;
1218 int error;
1219
1220 tail_cycle = CYCLE_LSN(tail_lsn);
1221 tail_block = BLOCK_LSN(tail_lsn);
1222 head_cycle = log->l_curr_cycle;
1223 head_block = log->l_curr_block;
1224
1225 /*
1226 * Figure out the distance between the new head of the log
1227 * and the tail. We want to write over any blocks beyond the
1228 * head that we may have written just before the crash, but
1229 * we don't want to overwrite the tail of the log.
1230 */
1231 if (head_cycle == tail_cycle) {
1232 /*
1233 * The tail is behind the head in the physical log,
1234 * so the distance from the head to the tail is the
1235 * distance from the head to the end of the log plus
1236 * the distance from the beginning of the log to the
1237 * tail.
1238 */
1239 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1240 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1241 XFS_ERRLEVEL_LOW, log->l_mp);
1242 return XFS_ERROR(EFSCORRUPTED);
1243 }
1244 tail_distance = tail_block + (log->l_logBBsize - head_block);
1245 } else {
1246 /*
1247 * The head is behind the tail in the physical log,
1248 * so the distance from the head to the tail is just
1249 * the tail block minus the head block.
1250 */
1251 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1252 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1253 XFS_ERRLEVEL_LOW, log->l_mp);
1254 return XFS_ERROR(EFSCORRUPTED);
1255 }
1256 tail_distance = tail_block - head_block;
1257 }
1258
1259 /*
1260 * If the head is right up against the tail, we can't clear
1261 * anything.
1262 */
1263 if (tail_distance <= 0) {
1264 ASSERT(tail_distance == 0);
1265 return 0;
1266 }
1267
1268 max_distance = XLOG_TOTAL_REC_SHIFT(log);
1269 /*
1270 * Take the smaller of the maximum amount of outstanding I/O
1271 * we could have and the distance to the tail to clear out.
1272 * We take the smaller so that we don't overwrite the tail and
1273 * we don't waste all day writing from the head to the tail
1274 * for no reason.
1275 */
1276 max_distance = MIN(max_distance, tail_distance);
1277
1278 if ((head_block + max_distance) <= log->l_logBBsize) {
1279 /*
1280 * We can stomp all the blocks we need to without
1281 * wrapping around the end of the log. Just do it
1282 * in a single write. Use the cycle number of the
1283 * current cycle minus one so that the log will look like:
1284 * n ... | n - 1 ...
1285 */
1286 error = xlog_write_log_records(log, (head_cycle - 1),
1287 head_block, max_distance, tail_cycle,
1288 tail_block);
1289 if (error)
1290 return error;
1291 } else {
1292 /*
1293 * We need to wrap around the end of the physical log in
1294 * order to clear all the blocks. Do it in two separate
1295 * I/Os. The first write should be from the head to the
1296 * end of the physical log, and it should use the current
1297 * cycle number minus one just like above.
1298 */
1299 distance = log->l_logBBsize - head_block;
1300 error = xlog_write_log_records(log, (head_cycle - 1),
1301 head_block, distance, tail_cycle,
1302 tail_block);
1303
1304 if (error)
1305 return error;
1306
1307 /*
1308 * Now write the blocks at the start of the physical log.
1309 * This writes the remainder of the blocks we want to clear.
1310 * It uses the current cycle number since we're now on the
1311 * same cycle as the head so that we get:
1312 * n ... n ... | n - 1 ...
1313 * ^^^^^ blocks we're writing
1314 */
1315 distance = max_distance - (log->l_logBBsize - head_block);
1316 error = xlog_write_log_records(log, head_cycle, 0, distance,
1317 tail_cycle, tail_block);
1318 if (error)
1319 return error;
1320 }
1321
1322 return 0;
1323}
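/*
 * Worked example (illustrative only): with l_logBBsize = 1000,
 * head = (cycle 7, block 900) and tail = (cycle 7, block 100),
 * tail_distance = 100 + (1000 - 900) = 200. Assuming max_distance
 * is capped at that 200, the first write stamps blocks 900..999
 * with cycle 6 dummy records and the second stamps blocks 0..99
 * with cycle 7, matching the pictures above.
 */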
1324
1325/******************************************************************************
1326 *
1327 * Log recover routines
1328 *
1329 ******************************************************************************
1330 */
1331
1332STATIC xlog_recover_t *
1333xlog_recover_find_tid(
1334 xlog_recover_t *q,
1335 xlog_tid_t tid)
1336{
1337 xlog_recover_t *p = q;
1338
1339 while (p != NULL) {
1340 if (p->r_log_tid == tid)
1341 break;
1342 p = p->r_next;
1343 }
1344 return p;
1345}
1346
1347STATIC void
1348xlog_recover_put_hashq(
1349 xlog_recover_t **q,
1350 xlog_recover_t *trans)
1351{
1352 trans->r_next = *q;
1353 *q = trans;
1354}
1355
1356STATIC void
1357xlog_recover_add_item(
1358 xlog_recover_item_t **itemq)
1359{
1360 xlog_recover_item_t *item;
1361
1362 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1363 xlog_recover_insert_item_backq(itemq, item);
1364}
1365
1366STATIC int
1367xlog_recover_add_to_cont_trans(
1368 xlog_recover_t *trans,
1369 xfs_caddr_t dp,
1370 int len)
1371{
1372 xlog_recover_item_t *item;
1373 xfs_caddr_t ptr, old_ptr;
1374 int old_len;
1375
1376 item = trans->r_itemq;
1377 if (item == NULL) {
1378 /* finish copying rest of trans header */
1379 xlog_recover_add_item(&trans->r_itemq);
1380 ptr = (xfs_caddr_t) &trans->r_theader +
1381 sizeof(xfs_trans_header_t) - len;
1382 memcpy(ptr, dp, len); /* dest, src, len */
1383 return 0;
1384 }
1385 item = item->ri_prev;
1386
1387 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1388 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1389
1390 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0);
1391 memcpy(&ptr[old_len], dp, len); /* dest, src, len */
1392 item->ri_buf[item->ri_cnt-1].i_len += len;
1393 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1394 return 0;
1395}
1396
1397/*
1398 * The next region to add is the start of a new region. It could be
1399 * a whole region or just the first part of one. Because
1400 * of this, the assumption here is that the type and size fields of all
1401 * format structures fit into the first 32 bits of the structure.
1402 *
1403 * This works because all regions must be 32 bit aligned. Therefore, we
1404 * either have both fields or we have neither field. In the case we have
1405 * neither field, the data part of the region is zero length. We only have
1406 * a log_op_header and can throw away the header since a new one will appear
1407 * later. If we have at least 4 bytes, then we can determine how many regions
1408 * will appear in the current log item.
1409 */
1410STATIC int
1411xlog_recover_add_to_trans(
1412 xlog_recover_t *trans,
1413 xfs_caddr_t dp,
1414 int len)
1415{
1416 xfs_inode_log_format_t *in_f; /* any will do */
1417 xlog_recover_item_t *item;
1418 xfs_caddr_t ptr;
1419
1420 if (!len)
1421 return 0;
1422 item = trans->r_itemq;
1423 if (item == NULL) {
1424 ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
1425 if (len == sizeof(xfs_trans_header_t))
1426 xlog_recover_add_item(&trans->r_itemq);
1427 memcpy(&trans->r_theader, dp, len); /* dest, src, len */
1428 return 0;
1429 }
1430
1431 ptr = kmem_alloc(len, KM_SLEEP);
1432 memcpy(ptr, dp, len);
1433 in_f = (xfs_inode_log_format_t *)ptr;
1434
1435 if (item->ri_prev->ri_total != 0 &&
1436 item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1437 xlog_recover_add_item(&trans->r_itemq);
1438 }
1439 item = trans->r_itemq;
1440 item = item->ri_prev;
1441
1442 if (item->ri_total == 0) { /* first region to be added */
1443 item->ri_total = in_f->ilf_size;
1444 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1445 item->ri_buf = kmem_zalloc((item->ri_total *
1446 sizeof(xfs_log_iovec_t)), KM_SLEEP);
1447 }
1448 ASSERT(item->ri_total > item->ri_cnt);
1449 /* Description region is ri_buf[0] */
1450 item->ri_buf[item->ri_cnt].i_addr = ptr;
1451 item->ri_buf[item->ri_cnt].i_len = len;
1452 item->ri_cnt++;
1453 return 0;
1454}
1455
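/*
 * Illustrative sketch (hypothetical, not part of the XFS source): every log
 * item format structure starts with 16-bit type and size fields, which is
 * why xlog_recover_add_to_trans() can overlay xfs_inode_log_format_t on any
 * item just to read ilf_size. A minimal standalone model of that assumption:
 */
#include <stdint.h>

struct any_log_format {
	uint16_t type;	/* log item type, e.g. XFS_LI_INODE */
	uint16_t size;	/* number of regions in this item */
};

/* Peek at the region count carried in the first 32 bits. */
static uint16_t
region_count(const void *dp)
{
	return ((const struct any_log_format *)dp)->size;
}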
1456STATIC void
1457xlog_recover_new_tid(
1458 xlog_recover_t **q,
1459 xlog_tid_t tid,
1460 xfs_lsn_t lsn)
1461{
1462 xlog_recover_t *trans;
1463
1464 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1465 trans->r_log_tid = tid;
1466 trans->r_lsn = lsn;
1467 xlog_recover_put_hashq(q, trans);
1468}
1469
1470STATIC int
1471xlog_recover_unlink_tid(
1472 xlog_recover_t **q,
1473 xlog_recover_t *trans)
1474{
1475 xlog_recover_t *tp;
1476 int found = 0;
1477
1478 ASSERT(trans != NULL);
1479 if (trans == *q) {
1480 *q = (*q)->r_next;
1481 } else {
1482 tp = *q;
1483 while (tp != NULL) {
1484 if (tp->r_next == trans) {
1485 found = 1;
1486 break;
1487 }
1488 tp = tp->r_next;
1489 }
1490 if (!found) {
1491 xlog_warn(
1492 "XFS: xlog_recover_unlink_tid: trans not found");
1493 ASSERT(0);
1494 return XFS_ERROR(EIO);
1495 }
1496 tp->r_next = tp->r_next->r_next;
1497 }
1498 return 0;
1499}
1500
1501STATIC void
1502xlog_recover_insert_item_backq(
1503 xlog_recover_item_t **q,
1504 xlog_recover_item_t *item)
1505{
1506 if (*q == NULL) {
1507 item->ri_prev = item->ri_next = item;
1508 *q = item;
1509 } else {
1510 item->ri_next = *q;
1511 item->ri_prev = (*q)->ri_prev;
1512 (*q)->ri_prev = item;
1513 item->ri_prev->ri_next = item;
1514 }
1515}
1516
1517STATIC void
1518xlog_recover_insert_item_frontq(
1519 xlog_recover_item_t **q,
1520 xlog_recover_item_t *item)
1521{
1522 xlog_recover_insert_item_backq(q, item);
1523 *q = item;
1524}
1525
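/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the item
 * queue used above is a circular doubly linked list whose head is *q.
 * Back-insertion links the new node just before the head, and
 * front-insertion, as the code above shows, is the same splice followed by
 * repointing *q at the new node.
 */
struct node {
	struct node *prev;
	struct node *next;
};

static void
insert_back(struct node **q, struct node *n)
{
	if (*q == NULL) {
		n->prev = n->next = n;	/* singleton ring */
		*q = n;
	} else {
		n->next = *q;		/* new node becomes the tail */
		n->prev = (*q)->prev;
		(*q)->prev->next = n;	/* old tail now points at n */
		(*q)->prev = n;
	}
}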
1526STATIC int
1527xlog_recover_reorder_trans(
1528 xlog_t *log,
1529 xlog_recover_t *trans)
1530{
1531 xlog_recover_item_t *first_item, *itemq, *itemq_next;
1532 xfs_buf_log_format_t *buf_f;
1533 xfs_buf_log_format_v1_t *obuf_f;
1534 ushort flags = 0;
1535
1536 first_item = itemq = trans->r_itemq;
1537 trans->r_itemq = NULL;
1538 do {
1539 itemq_next = itemq->ri_next;
1540 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1541 switch (ITEM_TYPE(itemq)) {
1542 case XFS_LI_BUF:
1543 flags = buf_f->blf_flags;
1544 break;
1545 case XFS_LI_6_1_BUF:
1546 case XFS_LI_5_3_BUF:
1547 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1548 flags = obuf_f->blf_flags;
1549 break;
1550 }
1551
1552 switch (ITEM_TYPE(itemq)) {
1553 case XFS_LI_BUF:
1554 case XFS_LI_6_1_BUF:
1555 case XFS_LI_5_3_BUF:
1556 if (!(flags & XFS_BLI_CANCEL)) {
1557 xlog_recover_insert_item_frontq(&trans->r_itemq,
1558 itemq);
1559 break;
1560 }
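/* buffer was cancelled: fall through and queue it at the back */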
1561 case XFS_LI_INODE:
1562 case XFS_LI_6_1_INODE:
1563 case XFS_LI_5_3_INODE:
1564 case XFS_LI_DQUOT:
1565 case XFS_LI_QUOTAOFF:
1566 case XFS_LI_EFD:
1567 case XFS_LI_EFI:
1568 xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
1569 break;
1570 default:
1571 xlog_warn(
1572 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1573 ASSERT(0);
1574 return XFS_ERROR(EIO);
1575 }
1576 itemq = itemq_next;
1577 } while (first_item != itemq);
1578 return 0;
1579}
1580
1581/*
1582 * Build up the table of buf cancel records so that we don't replay
1583 * cancelled data in the second pass. For buffer records that are
1584 * not cancel records, there is nothing to do here so we just return.
1585 *
1586 * If we get a cancel record which is already in the table, this indicates
1587 * that the buffer was cancelled multiple times. In order to ensure
1588 * that during pass 2 we keep the record in the table until we reach its
1589 * last occurrence in the log, we keep a reference count in the cancel
1590 * record in the table to tell us how many times we expect to see this
1591 * record during the second pass.
1592 */
1593STATIC void
1594xlog_recover_do_buffer_pass1(
1595 xlog_t *log,
1596 xfs_buf_log_format_t *buf_f)
1597{
1598 xfs_buf_cancel_t *bcp;
1599 xfs_buf_cancel_t *nextp;
1600 xfs_buf_cancel_t *prevp;
1601 xfs_buf_cancel_t **bucket;
1602 xfs_buf_log_format_v1_t *obuf_f;
1603 xfs_daddr_t blkno = 0;
1604 uint len = 0;
1605 ushort flags = 0;
1606
1607 switch (buf_f->blf_type) {
1608 case XFS_LI_BUF:
1609 blkno = buf_f->blf_blkno;
1610 len = buf_f->blf_len;
1611 flags = buf_f->blf_flags;
1612 break;
1613 case XFS_LI_6_1_BUF:
1614 case XFS_LI_5_3_BUF:
1615 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1616 blkno = (xfs_daddr_t) obuf_f->blf_blkno;
1617 len = obuf_f->blf_len;
1618 flags = obuf_f->blf_flags;
1619 break;
1620 }
1621
1622 /*
1623 * If this isn't a cancel buffer item, then just return.
1624 */
1625 if (!(flags & XFS_BLI_CANCEL))
1626 return;
1627
1628 /*
1629 * Insert an xfs_buf_cancel record into the hash table of
1630 * them. If there is already an identical record, bump
1631 * its reference count.
1632 */
1633 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1634 XLOG_BC_TABLE_SIZE];
1635 /*
1636 * If the hash bucket is empty then just insert a new record into
1637 * the bucket.
1638 */
1639 if (*bucket == NULL) {
1640 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1641 KM_SLEEP);
1642 bcp->bc_blkno = blkno;
1643 bcp->bc_len = len;
1644 bcp->bc_refcount = 1;
1645 bcp->bc_next = NULL;
1646 *bucket = bcp;
1647 return;
1648 }
1649
1650 /*
1651 * The hash bucket is not empty, so search for duplicates of our
1652 * record. If we find one, just bump its refcount. If not,
1653 * then add a new record at the end of the list.
1654 */
1655 prevp = NULL;
1656 nextp = *bucket;
1657 while (nextp != NULL) {
1658 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1659 nextp->bc_refcount++;
1660 return;
1661 }
1662 prevp = nextp;
1663 nextp = nextp->bc_next;
1664 }
1665 ASSERT(prevp != NULL);
1666 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1667 KM_SLEEP);
1668 bcp->bc_blkno = blkno;
1669 bcp->bc_len = len;
1670 bcp->bc_refcount = 1;
1671 bcp->bc_next = NULL;
1672 prevp->bc_next = bcp;
1673}
1674
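/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the cancel
 * table built above is a fixed-size array of singly linked buckets keyed by
 * block number. A standalone lookup helper mirroring the duplicate search in
 * xlog_recover_do_buffer_pass1():
 */
#include <stddef.h>

#define BC_TABLE_SIZE 64	/* stand-in for XLOG_BC_TABLE_SIZE */

struct buf_cancel {
	long long blkno;
	unsigned int len;
	int refcount;
	struct buf_cancel *next;
};

static struct buf_cancel *
find_cancel(struct buf_cancel *table[], long long blkno, unsigned int len)
{
	struct buf_cancel *bcp;

	bcp = table[(unsigned long long)blkno % BC_TABLE_SIZE];
	for (; bcp != NULL; bcp = bcp->next)
		if (bcp->blkno == blkno && bcp->len == len)
			return bcp;	/* caller bumps the refcount */
	return NULL;
}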
1675/*
1676 * Check to see whether the buffer being recovered has a corresponding
1677 * entry in the buffer cancel record table. If it does then return 1
1678 * so that it will be cancelled, otherwise return 0. If the buffer is
1679 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1680 * the refcount on the entry in the table and remove it from the table
1681 * if this is the last reference.
1682 *
1683 * We remove the cancel record from the table when we encounter its
1684 * last occurrence in the log so that if the same buffer is re-used
1685 * again after its last cancellation we actually replay the changes
1686 * made at that point.
1687 */
1688STATIC int
1689xlog_check_buffer_cancelled(
1690 xlog_t *log,
1691 xfs_daddr_t blkno,
1692 uint len,
1693 ushort flags)
1694{
1695 xfs_buf_cancel_t *bcp;
1696 xfs_buf_cancel_t *prevp;
1697 xfs_buf_cancel_t **bucket;
1698
1699 if (log->l_buf_cancel_table == NULL) {
1700 /*
1701 * There is nothing in the table built in pass one,
1702 * so this buffer must not be cancelled.
1703 */
1704 ASSERT(!(flags & XFS_BLI_CANCEL));
1705 return 0;
1706 }
1707
1708 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1709 XLOG_BC_TABLE_SIZE];
1710 bcp = *bucket;
1711 if (bcp == NULL) {
1712 /*
1713 * There is no corresponding entry in the table built
1714 * in pass one, so this buffer has not been cancelled.
1715 */
1716 ASSERT(!(flags & XFS_BLI_CANCEL));
1717 return 0;
1718 }
1719
1720 /*
1721 * Search for an entry in the buffer cancel table that
1722 * matches our buffer.
1723 */
1724 prevp = NULL;
1725 while (bcp != NULL) {
1726 if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1727 /*
1728 * We've got a match, so return 1 so that the
1729 * recovery of this buffer is cancelled.
1730 * If this buffer is actually a buffer cancel
1731 * log item, then decrement the refcount on the
1732 * one in the table and remove it if this is the
1733 * last reference.
1734 */
1735 if (flags & XFS_BLI_CANCEL) {
1736 bcp->bc_refcount--;
1737 if (bcp->bc_refcount == 0) {
1738 if (prevp == NULL) {
1739 *bucket = bcp->bc_next;
1740 } else {
1741 prevp->bc_next = bcp->bc_next;
1742 }
1743 kmem_free(bcp,
1744 sizeof(xfs_buf_cancel_t));
1745 }
1746 }
1747 return 1;
1748 }
1749 prevp = bcp;
1750 bcp = bcp->bc_next;
1751 }
1752 /*
1753 * We didn't find a corresponding entry in the table, so
1754 * return 0 so that the buffer is NOT cancelled.
1755 */
1756 ASSERT(!(flags & XFS_BLI_CANCEL));
1757 return 0;
1758}
1759
1760STATIC int
1761xlog_recover_do_buffer_pass2(
1762 xlog_t *log,
1763 xfs_buf_log_format_t *buf_f)
1764{
1765 xfs_buf_log_format_v1_t *obuf_f;
1766 xfs_daddr_t blkno = 0;
1767 ushort flags = 0;
1768 uint len = 0;
1769
1770 switch (buf_f->blf_type) {
1771 case XFS_LI_BUF:
1772 blkno = buf_f->blf_blkno;
1773 flags = buf_f->blf_flags;
1774 len = buf_f->blf_len;
1775 break;
1776 case XFS_LI_6_1_BUF:
1777 case XFS_LI_5_3_BUF:
1778 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1779 blkno = (xfs_daddr_t) obuf_f->blf_blkno;
1780 flags = obuf_f->blf_flags;
1781 len = (xfs_daddr_t) obuf_f->blf_len;
1782 break;
1783 }
1784
1785 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1786}
1787
1788/*
1789 * Perform recovery for a buffer full of inodes. In these buffers,
1790 * the only data which should be recovered is that which corresponds
1791 * to the di_next_unlinked pointers in the on disk inode structures.
1792 * The rest of the data for the inodes is always logged through the
1793 * inodes themselves rather than the inode buffer and is recovered
1794 * in xlog_recover_do_inode_trans().
1795 *
1796 * The only time when buffers full of inodes are fully recovered is
1797 * when the buffer is full of newly allocated inodes. In this case
1798 * the buffer will not be marked as an inode buffer and so will be
1799 * sent to xlog_recover_do_reg_buffer() below during recovery.
1800 */
1801STATIC int
1802xlog_recover_do_inode_buffer(
1803 xfs_mount_t *mp,
1804 xlog_recover_item_t *item,
1805 xfs_buf_t *bp,
1806 xfs_buf_log_format_t *buf_f)
1807{
1808 int i;
1809 int item_index;
1810 int bit;
1811 int nbits;
1812 int reg_buf_offset;
1813 int reg_buf_bytes;
1814 int next_unlinked_offset;
1815 int inodes_per_buf;
1816 xfs_agino_t *logged_nextp;
1817 xfs_agino_t *buffer_nextp;
1818 xfs_buf_log_format_v1_t *obuf_f;
1819 unsigned int *data_map = NULL;
1820 unsigned int map_size = 0;
1821
1822 switch (buf_f->blf_type) {
1823 case XFS_LI_BUF:
1824 data_map = buf_f->blf_data_map;
1825 map_size = buf_f->blf_map_size;
1826 break;
1827 case XFS_LI_6_1_BUF:
1828 case XFS_LI_5_3_BUF:
1829 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1830 data_map = obuf_f->blf_data_map;
1831 map_size = obuf_f->blf_map_size;
1832 break;
1833 }
1834 /*
1835 * Set the variables corresponding to the current region to
1836 * 0 so that we'll initialize them on the first pass through
1837 * the loop.
1838 */
1839 reg_buf_offset = 0;
1840 reg_buf_bytes = 0;
1841 bit = 0;
1842 nbits = 0;
1843 item_index = 0;
1844 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1845 for (i = 0; i < inodes_per_buf; i++) {
1846 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1847 offsetof(xfs_dinode_t, di_next_unlinked);
1848
1849 while (next_unlinked_offset >=
1850 (reg_buf_offset + reg_buf_bytes)) {
1851 /*
1852 * The next di_next_unlinked field is beyond
1853 * the current logged region. Find the next
1854 * logged region that contains or is beyond
1855 * the current di_next_unlinked field.
1856 */
1857 bit += nbits;
1858 bit = xfs_next_bit(data_map, map_size, bit);
1859
1860 /*
1861 * If there are no more logged regions in the
1862 * buffer, then we're done.
1863 */
1864 if (bit == -1) {
1865 return 0;
1866 }
1867
1868 nbits = xfs_contig_bits(data_map, map_size,
1869 bit);
1870 ASSERT(nbits > 0);
1871 reg_buf_offset = bit << XFS_BLI_SHIFT;
1872 reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1873 item_index++;
1874 }
1875
1876 /*
1877 * If the current logged region starts after the current
1878 * di_next_unlinked field, then move on to the next
1879 * di_next_unlinked field.
1880 */
1881 if (next_unlinked_offset < reg_buf_offset) {
1882 continue;
1883 }
1884
1885 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1886 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1887 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1888
1889 /*
1890 * The current logged region contains a copy of the
1891 * current di_next_unlinked field. Extract its value
1892 * and copy it to the buffer copy.
1893 */
1894 logged_nextp = (xfs_agino_t *)
1895 ((char *)(item->ri_buf[item_index].i_addr) +
1896 (next_unlinked_offset - reg_buf_offset));
1897 if (unlikely(*logged_nextp == 0)) {
1898 xfs_fs_cmn_err(CE_ALERT, mp,
1899 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
1900 item, bp);
1901 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1902 XFS_ERRLEVEL_LOW, mp);
1903 return XFS_ERROR(EFSCORRUPTED);
1904 }
1905
1906 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1907 next_unlinked_offset);
1908 INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
1909 }
1910
1911 return 0;
1912}
1913
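/*
 * Illustrative sketch (hypothetical, not part of the XFS source): logged
 * regions are tracked in XFS_BLI_CHUNK-sized units, so a run of set bits
 * converts to a byte range with XFS_BLI_SHIFT, exactly as the loop above
 * computes reg_buf_offset and reg_buf_bytes. Assuming the historical values
 * of a 128-byte chunk and a shift of 7:
 */
#define BLI_SHIFT 7			/* stand-in for XFS_BLI_SHIFT */
#define BLI_CHUNK (1 << BLI_SHIFT)	/* stand-in for XFS_BLI_CHUNK */

static void
region_to_bytes(int bit, int nbits, int *offset, int *bytes)
{
	*offset = bit << BLI_SHIFT;	/* byte offset of the region */
	*bytes = nbits << BLI_SHIFT;	/* byte length of the region */
}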
1914/*
1915 * Perform a 'normal' buffer recovery. Each logged region of the
1916 * buffer should be copied over the corresponding region in the
1917 * given buffer. The bitmap in the buf log format structure indicates
1918 * where to place the logged data.
1919 */
1920/*ARGSUSED*/
1921STATIC void
1922xlog_recover_do_reg_buffer(
1923 xfs_mount_t *mp,
1924 xlog_recover_item_t *item,
1925 xfs_buf_t *bp,
1926 xfs_buf_log_format_t *buf_f)
1927{
1928 int i;
1929 int bit;
1930 int nbits;
1931 xfs_buf_log_format_v1_t *obuf_f;
1932 unsigned int *data_map = NULL;
1933 unsigned int map_size = 0;
1934 int error;
1935
1936 switch (buf_f->blf_type) {
1937 case XFS_LI_BUF:
1938 data_map = buf_f->blf_data_map;
1939 map_size = buf_f->blf_map_size;
1940 break;
1941 case XFS_LI_6_1_BUF:
1942 case XFS_LI_5_3_BUF:
1943 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1944 data_map = obuf_f->blf_data_map;
1945 map_size = obuf_f->blf_map_size;
1946 break;
1947 }
1948 bit = 0;
1949 i = 1; /* 0 is the buf format structure */
1950 while (1) {
1951 bit = xfs_next_bit(data_map, map_size, bit);
1952 if (bit == -1)
1953 break;
1954 nbits = xfs_contig_bits(data_map, map_size, bit);
1955 ASSERT(nbits > 0);
1956 ASSERT(item->ri_buf[i].i_addr != NULL);
1957 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1958 ASSERT(XFS_BUF_COUNT(bp) >=
1959 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1960
1961 /*
1962 * Do a sanity check if this is a dquot buffer. Just checking
1963 * the first dquot in the buffer should do. XXXThis is
1964 * probably a good thing to do for other buf types also.
1965 */
1966 error = 0;
1967 if (buf_f->blf_flags & (XFS_BLI_UDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1968 error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1969 item->ri_buf[i].i_addr,
1970 -1, 0, XFS_QMOPT_DOWARN,
1971 "dquot_buf_recover");
1972 }
1973 if (!error)
1974 memcpy(xfs_buf_offset(bp,
1975 (uint)bit << XFS_BLI_SHIFT), /* dest */
1976 item->ri_buf[i].i_addr, /* source */
1977 nbits<<XFS_BLI_SHIFT); /* length */
1978 i++;
1979 bit += nbits;
1980 }
1981
1982 /* Shouldn't be any more regions */
1983 ASSERT(i == item->ri_total);
1984}
1985
1986/*
1987 * Do some primitive error checking on ondisk dquot data structures.
1988 */
1989int
1990xfs_qm_dqcheck(
1991 xfs_disk_dquot_t *ddq,
1992 xfs_dqid_t id,
1993 uint type, /* used only when IO_dorepair is true */
1994 uint flags,
1995 char *str)
1996{
1997 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
1998 int errs = 0;
1999
2000 /*
2001 * We can encounter an uninitialized dquot buffer for 2 reasons:
2002 * 1. If we crash while deleting the quotainode(s), and those blks got
2003 * used for user data. This is because we take the path of regular
2004 * file deletion; however, the size field of quotainodes is never
2005 * updated, so all the tricks that we play in itruncate_finish
2006 * don't quite matter.
2007 *
2008 * 2. We don't replay the quota buffers when there's a quotaoff logitem.
2009 * But the allocation will be replayed so we'll end up with an
2010 * uninitialized quota block.
2011 *
2012 * This is all fine; things are still consistent, and we haven't lost
2013 * any quota information. Just don't complain about bad dquot blks.
2014 */
2015 if (INT_GET(ddq->d_magic, ARCH_CONVERT) != XFS_DQUOT_MAGIC) {
2016 if (flags & XFS_QMOPT_DOWARN)
2017 cmn_err(CE_ALERT,
2018 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2019 str, id,
2020 INT_GET(ddq->d_magic, ARCH_CONVERT), XFS_DQUOT_MAGIC);
2021 errs++;
2022 }
2023 if (INT_GET(ddq->d_version, ARCH_CONVERT) != XFS_DQUOT_VERSION) {
2024 if (flags & XFS_QMOPT_DOWARN)
2025 cmn_err(CE_ALERT,
2026 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2027 str, id,
2028 INT_GET(ddq->d_version, ARCH_CONVERT), XFS_DQUOT_VERSION);
2029 errs++;
2030 }
2031
2032 if (INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_USER &&
2033 INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_GROUP) {
2034 if (flags & XFS_QMOPT_DOWARN)
2035 cmn_err(CE_ALERT,
2036 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2037 str, id, INT_GET(ddq->d_flags, ARCH_CONVERT));
2038 errs++;
2039 }
2040
2041 if (id != -1 && id != INT_GET(ddq->d_id, ARCH_CONVERT)) {
2042 if (flags & XFS_QMOPT_DOWARN)
2043 cmn_err(CE_ALERT,
2044 "%s : ondisk-dquot 0x%p, ID mismatch: "
2045 "0x%x expected, found id 0x%x",
2046 str, ddq, id, INT_GET(ddq->d_id, ARCH_CONVERT));
2047 errs++;
2048 }
2049
2050 if (!errs && ddq->d_id) {
2051 if (INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT) &&
2052 INT_GET(ddq->d_bcount, ARCH_CONVERT) >=
2053 INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT)) {
2054 if (!ddq->d_btimer) {
2055 if (flags & XFS_QMOPT_DOWARN)
2056 cmn_err(CE_ALERT,
2057 "%s : Dquot ID 0x%x (0x%p) "
2058 "BLK TIMER NOT STARTED",
2059 str, (int)
2060 INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
2061 errs++;
2062 }
2063 }
2064 if (INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT) &&
2065 INT_GET(ddq->d_icount, ARCH_CONVERT) >=
2066 INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT)) {
2067 if (!ddq->d_itimer) {
2068 if (flags & XFS_QMOPT_DOWARN)
2069 cmn_err(CE_ALERT,
2070 "%s : Dquot ID 0x%x (0x%p) "
2071 "INODE TIMER NOT STARTED",
2072 str, (int)
2073 INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
2074 errs++;
2075 }
2076 }
2077 if (INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT) &&
2078 INT_GET(ddq->d_rtbcount, ARCH_CONVERT) >=
2079 INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT)) {
2080 if (!ddq->d_rtbtimer) {
2081 if (flags & XFS_QMOPT_DOWARN)
2082 cmn_err(CE_ALERT,
2083 "%s : Dquot ID 0x%x (0x%p) "
2084 "RTBLK TIMER NOT STARTED",
2085 str, (int)
2086 INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
2087 errs++;
2088 }
2089 }
2090 }
2091
2092 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2093 return errs;
2094
2095 if (flags & XFS_QMOPT_DOWARN)
2096 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2097
2098 /*
2099 * Typically, a repair is only requested by quotacheck.
2100 */
2101 ASSERT(id != -1);
2102 ASSERT(flags & XFS_QMOPT_DQREPAIR);
2103 memset(d, 0, sizeof(xfs_dqblk_t));
2104 INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC);
2105 INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION);
2106 INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id);
2107 INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type);
2108
2109 return errs;
2110}
2111
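/*
 * Illustrative usage sketch (hypothetical call site, not from the XFS
 * source): log recovery passes XFS_QMOPT_DOWARN alone, so a bad dquot is
 * only reported. Quotacheck is the caller expected to add
 * XFS_QMOPT_DQREPAIR, which makes xfs_qm_dqcheck() re-initialize the
 * block in place:
 *
 *	errs = xfs_qm_dqcheck(ddq, id, XFS_DQ_USER,
 *			XFS_QMOPT_DOWARN | XFS_QMOPT_DQREPAIR,
 *			"quotacheck");
 *
 * A nonzero return means errors were found; with DQREPAIR set, the dquot
 * has by then been reset to the given id and type.
 */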
2112/*
2113 * Perform a dquot buffer recovery.
2114 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2115 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2116 * Else, treat it as a regular buffer and do recovery.
2117 */
2118STATIC void
2119xlog_recover_do_dquot_buffer(
2120 xfs_mount_t *mp,
2121 xlog_t *log,
2122 xlog_recover_item_t *item,
2123 xfs_buf_t *bp,
2124 xfs_buf_log_format_t *buf_f)
2125{
2126 uint type;
2127
2128 /*
2129 * Filesystems are required to send in quota flags at mount time.
2130 */
2131 if (mp->m_qflags == 0) {
2132 return;
2133 }
2134
2135 type = 0;
2136 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2137 type |= XFS_DQ_USER;
2138 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2139 type |= XFS_DQ_GROUP;
2140 /*
2141 * This type of quota was turned off, so ignore this buffer.
2142 */
2143 if (log->l_quotaoffs_flag & type)
2144 return;
2145
2146 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2147}
2148
2149/*
2150 * This routine replays a modification made to a buffer at runtime.
2151 * There are actually two types of buffer, regular and inode, which
2152 * are handled differently. Inode buffers are handled differently
2153 * in that we only recover a specific set of data from them, namely
2154 * the inode di_next_unlinked fields. This is because all other inode
2155 * data is actually logged via inode records and any data we replay
2156 * here which overlaps that may be stale.
2157 *
2158 * When meta-data buffers are freed at run time we log a buffer item
2159 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2160 * of the buffer in the log should not be replayed at recovery time.
2161 * This is so that if the blocks covered by the buffer are reused for
2162 * file data before we crash, we don't end up replaying old, freed
2163 * meta-data into a user's file.
2164 *
2165 * To handle the cancellation of buffer log items, we make two passes
2166 * over the log during recovery. During the first we build a table of
2167 * those buffers which have been cancelled, and during the second we
2168 * only replay those buffers which do not have corresponding cancel
2169 * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2170 * for more details on the implementation of the table of cancel records.
2171 */
2172STATIC int
2173xlog_recover_do_buffer_trans(
2174 xlog_t *log,
2175 xlog_recover_item_t *item,
2176 int pass)
2177{
2178 xfs_buf_log_format_t *buf_f;
2179 xfs_buf_log_format_v1_t *obuf_f;
2180 xfs_mount_t *mp;
2181 xfs_buf_t *bp;
2182 int error;
2183 int cancel;
2184 xfs_daddr_t blkno;
2185 int len;
2186 ushort flags;
2187
2188 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2189
2190 if (pass == XLOG_RECOVER_PASS1) {
2191 /*
2192 * In this pass we're only looking for buf items
2193 * with the XFS_BLI_CANCEL bit set.
2194 */
2195 xlog_recover_do_buffer_pass1(log, buf_f);
2196 return 0;
2197 } else {
2198 /*
2199 * In this pass we want to recover all the buffers
2200 * which have not been cancelled and are not
2201 * cancellation buffers themselves. The routine
2202 * we call here will tell us whether or not to
2203 * continue with the replay of this buffer.
2204 */
2205 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2206 if (cancel) {
2207 return 0;
2208 }
2209 }
2210 switch (buf_f->blf_type) {
2211 case XFS_LI_BUF:
2212 blkno = buf_f->blf_blkno;
2213 len = buf_f->blf_len;
2214 flags = buf_f->blf_flags;
2215 break;
2216 case XFS_LI_6_1_BUF:
2217 case XFS_LI_5_3_BUF:
2218 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
2219 blkno = obuf_f->blf_blkno;
2220 len = obuf_f->blf_len;
2221 flags = obuf_f->blf_flags;
2222 break;
2223 default:
2224 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2225 "xfs_log_recover: unknown buffer type 0x%x, dev %s",
2226 buf_f->blf_type, XFS_BUFTARG_NAME(log->l_targ));
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231
2232 mp = log->l_mp;
2233 if (flags & XFS_BLI_INODE_BUF) {
2234 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2235 XFS_BUF_LOCK);
2236 } else {
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2238 }
2239 if (XFS_BUF_ISERROR(bp)) {
2240 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2241 bp, blkno);
2242 error = XFS_BUF_GETERROR(bp);
2243 xfs_buf_relse(bp);
2244 return error;
2245 }
2246
2247 error = 0;
2248 if (flags & XFS_BLI_INODE_BUF) {
2249 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2250 } else if (flags & (XFS_BLI_UDQUOT_BUF | XFS_BLI_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else {
2253 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2254 }
2255 if (error) {
xfs_buf_relse(bp);
2256 return XFS_ERROR(error);
}
2257
2258 /*
2259 * Perform delayed write on the buffer. Asynchronous writes will be
2260 * slower when taking into account all the buffers to be flushed.
2261 *
2262 * Also make sure that only inode buffers with good sizes stay in
2263 * the buffer cache. The kernel moves inodes in buffers of 1 block
2264 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2265 * buffers in the log can be a different size if the log was generated
2266 * by an older kernel using unclustered inode buffers or a newer kernel
2267 * running with a different inode cluster size. Regardless, if the
2268 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2269 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2270 * the buffer out of the buffer cache so that the buffer won't
2271 * overlap with future reads of those inodes.
2272 */
2273 if (XFS_DINODE_MAGIC ==
2274 INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) &&
2275 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2276 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2277 XFS_BUF_STALE(bp);
2278 error = xfs_bwrite(mp, bp);
2279 } else {
2280 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2281 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2282 XFS_BUF_SET_FSPRIVATE(bp, mp);
2283 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2284 xfs_bdwrite(mp, bp);
2285 }
2286
2287 return (error);
2288}
2289
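/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the
 * staleness test above, reduced to a predicate. An inode buffer is only
 * kept in the cache when its size matches the larger of the filesystem
 * block size and our inode cluster size; otherwise it is marked stale and
 * written out synchronously.
 */
static int
keep_inode_buffer(unsigned int buf_bytes, unsigned int blocksize,
		  unsigned int cluster_bytes)
{
	unsigned int want;

	want = blocksize > cluster_bytes ? blocksize : cluster_bytes;
	return buf_bytes == want;	/* 0 => XFS_BUF_STALE + xfs_bwrite */
}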
2290STATIC int
2291xlog_recover_do_inode_trans(
2292 xlog_t *log,
2293 xlog_recover_item_t *item,
2294 int pass)
2295{
2296 xfs_inode_log_format_t *in_f;
2297 xfs_mount_t *mp;
2298 xfs_buf_t *bp;
2299 xfs_imap_t imap;
2300 xfs_dinode_t *dip;
2301 xfs_ino_t ino;
2302 int len;
2303 xfs_caddr_t src;
2304 xfs_caddr_t dest;
2305 int error;
2306 int attr_index;
2307 uint fields;
2308 xfs_dinode_core_t *dicp;
2309
2310 if (pass == XLOG_RECOVER_PASS1) {
2311 return 0;
2312 }
2313
2314 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2315 ino = in_f->ilf_ino;
2316 mp = log->l_mp;
2317 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2318 imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2319 imap.im_len = in_f->ilf_len;
2320 imap.im_boffset = in_f->ilf_boffset;
2321 } else {
2322 /*
2323 * It's an old inode format record. We don't know where
2324 * its cluster is located on disk, and we can't allow
2325 * xfs_imap() to figure it out because the inode btrees
2326 * are not ready to be used. Therefore do not pass the
2327 * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give
2328 * us only the single block in which the inode lives
2329 * rather than its cluster, so we must make sure to
2330 * invalidate the buffer when we write it out below.
2331 */
2332 imap.im_blkno = 0;
2333 xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2334 }
2335
2336 /*
2337 * Inode buffers can be freed, look out for it,
2338 * and do not replay the inode.
2339 */
2340 if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0))
2341 return 0;
2342
2343 bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
2344 XFS_BUF_LOCK);
2345 if (XFS_BUF_ISERROR(bp)) {
2346 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2347 bp, imap.im_blkno);
2348 error = XFS_BUF_GETERROR(bp);
2349 xfs_buf_relse(bp);
2350 return error;
2351 }
2352 error = 0;
2353 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2354 dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
2355
2356 /*
2357 * Make sure the place we're flushing out to really looks
2358 * like an inode!
2359 */
2360 if (unlikely(INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC)) {
2361 xfs_buf_relse(bp);
2362 xfs_fs_cmn_err(CE_ALERT, mp,
2363 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2364 dip, bp, ino);
2365 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2366 XFS_ERRLEVEL_LOW, mp);
2367 return XFS_ERROR(EFSCORRUPTED);
2368 }
2369 dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
2370 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2371 xfs_buf_relse(bp);
2372 xfs_fs_cmn_err(CE_ALERT, mp,
2373 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2374 item, ino);
2375 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2376 XFS_ERRLEVEL_LOW, mp);
2377 return XFS_ERROR(EFSCORRUPTED);
2378 }
2379
2380 /* Skip replay when the on disk inode is newer than the log one */
2381 if (dicp->di_flushiter <
2382 INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)) {
2383 /*
2384 * Deal with the wrap case, DI_MAX_FLUSH is less
2385 * than smaller numbers
2386 */
2387 if ((INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)
2388 == DI_MAX_FLUSH) &&
2389 (dicp->di_flushiter < (DI_MAX_FLUSH>>1))) {
2390 /* do nothing */
2391 } else {
2392 xfs_buf_relse(bp);
2393 return 0;
2394 }
2395 }
2396 /* Take the opportunity to reset the flush iteration count */
2397 dicp->di_flushiter = 0;
2398
2399 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2400 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2401 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2402 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2403 XFS_ERRLEVEL_LOW, mp, dicp);
2404 xfs_buf_relse(bp);
2405 xfs_fs_cmn_err(CE_ALERT, mp,
2406 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2407 item, dip, bp, ino);
2408 return XFS_ERROR(EFSCORRUPTED);
2409 }
2410 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2411 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2412 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2413 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2414 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2415 XFS_ERRLEVEL_LOW, mp, dicp);
2416 xfs_buf_relse(bp);
2417 xfs_fs_cmn_err(CE_ALERT, mp,
2418 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2419 item, dip, bp, ino);
2420 return XFS_ERROR(EFSCORRUPTED);
2421 }
2422 }
2423 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)) {
2424 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2425 XFS_ERRLEVEL_LOW, mp, dicp);
2426 xfs_buf_relse(bp);
2427 xfs_fs_cmn_err(CE_ALERT, mp,
2428 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2429 item, dip, bp, ino,
2430 dicp->di_nextents + dicp->di_anextents,
2431 dicp->di_nblocks);
2432 return XFS_ERROR(EFSCORRUPTED);
2433 }
2434 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2435 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2436 XFS_ERRLEVEL_LOW, mp, dicp);
2437 xfs_buf_relse(bp);
2438 xfs_fs_cmn_err(CE_ALERT, mp,
2439 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2440 item, dip, bp, ino, dicp->di_forkoff);
2441 return XFS_ERROR(EFSCORRUPTED);
2442 }
2443 if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
2444 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2445 XFS_ERRLEVEL_LOW, mp, dicp);
2446 xfs_buf_relse(bp);
2447 xfs_fs_cmn_err(CE_ALERT, mp,
2448 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2449 item->ri_buf[1].i_len, item);
2450 return XFS_ERROR(EFSCORRUPTED);
2451 }
2452
2453 /* The core is in in-core format */
2454 xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
2455 (xfs_dinode_core_t*)item->ri_buf[1].i_addr, -1);
2456
2457 /* the rest is in on-disk format */
2458 if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
2459 memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t),
2460 item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
2461 item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t));
2462 }
2463
2464 fields = in_f->ilf_fields;
2465 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2466 case XFS_ILOG_DEV:
2467 INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev);
2468
2469 break;
2470 case XFS_ILOG_UUID:
2471 dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
2472 break;
2473 }
2474
2475 if (in_f->ilf_size == 2)
2476 goto write_inode_buffer;
2477 len = item->ri_buf[2].i_len;
2478 src = item->ri_buf[2].i_addr;
2479 ASSERT(in_f->ilf_size <= 4);
2480 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2481 ASSERT(!(fields & XFS_ILOG_DFORK) ||
2482 (len == in_f->ilf_dsize));
2483
2484 switch (fields & XFS_ILOG_DFORK) {
2485 case XFS_ILOG_DDATA:
2486 case XFS_ILOG_DEXT:
2487 memcpy(&dip->di_u, src, len);
2488 break;
2489
2490 case XFS_ILOG_DBROOT:
2491 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2492 &(dip->di_u.di_bmbt),
2493 XFS_DFORK_DSIZE(dip, mp));
2494 break;
2495
2496 default:
2497 /*
2498 * There are no data fork flags set.
2499 */
2500 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2501 break;
2502 }
2503
2504 /*
2505 * If we logged any attribute data, recover it. There may or
2506 * may not have been any other non-core data logged in this
2507 * transaction.
2508 */
2509 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2510 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2511 attr_index = 3;
2512 } else {
2513 attr_index = 2;
2514 }
2515 len = item->ri_buf[attr_index].i_len;
2516 src = item->ri_buf[attr_index].i_addr;
2517 ASSERT(len == in_f->ilf_asize);
2518
2519 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2520 case XFS_ILOG_ADATA:
2521 case XFS_ILOG_AEXT:
2522 dest = XFS_DFORK_APTR(dip);
2523 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2524 memcpy(dest, src, len);
2525 break;
2526
2527 case XFS_ILOG_ABROOT:
2528 dest = XFS_DFORK_APTR(dip);
2529 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2530 (xfs_bmdr_block_t*)dest,
2531 XFS_DFORK_ASIZE(dip, mp));
2532 break;
2533
2534 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2536 ASSERT(0);
2537 xfs_buf_relse(bp);
2538 return XFS_ERROR(EIO);
2539 }
2540 }
2541
2542write_inode_buffer:
2543 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2544 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2545 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2546 XFS_BUF_SET_FSPRIVATE(bp, mp);
2547 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2548 xfs_bdwrite(mp, bp);
2549 } else {
2550 XFS_BUF_STALE(bp);
2551 error = xfs_bwrite(mp, bp);
2552 }
2553
2554 return (error);
2555}
2556
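/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the
 * flush-iteration comparison above as a standalone predicate, assuming
 * DI_MAX_FLUSH is the 16-bit ceiling 0xffff.
 */
#define MAX_FLUSH 0xffff	/* stand-in for DI_MAX_FLUSH */

static int
log_copy_is_stale(unsigned int disk_iter, unsigned int log_iter)
{
	if (log_iter >= disk_iter)
		return 0;	/* log copy at least as new: replay it */
	if (disk_iter == MAX_FLUSH && log_iter < (MAX_FLUSH >> 1))
		return 0;	/* counter wrapped: log copy is newer */
	return 1;		/* on-disk inode is newer: skip replay */
}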
2557/*
2558 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2559 * structure, so that we know not to do any dquot item or dquot buffer
2560 * recovery of that type.
2561 */
2562STATIC int
2563xlog_recover_do_quotaoff_trans(
2564 xlog_t *log,
2565 xlog_recover_item_t *item,
2566 int pass)
2567{
2568 xfs_qoff_logformat_t *qoff_f;
2569
2570 if (pass == XLOG_RECOVER_PASS2) {
2571 return (0);
2572 }
2573
2574 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2575 ASSERT(qoff_f);
2576
2577 /*
2578 * The logitem format's flag tells us if this was user quotaoff,
2579 * group quotaoff or both.
2580 */
2581 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2582 log->l_quotaoffs_flag |= XFS_DQ_USER;
2583 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2584 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2585
2586 return (0);
2587}
2588
2589/*
2590 * Recover a dquot record
2591 */
2592STATIC int
2593xlog_recover_do_dquot_trans(
2594 xlog_t *log,
2595 xlog_recover_item_t *item,
2596 int pass)
2597{
2598 xfs_mount_t *mp;
2599 xfs_buf_t *bp;
2600 struct xfs_disk_dquot *ddq, *recddq;
2601 int error;
2602 xfs_dq_logformat_t *dq_f;
2603 uint type;
2604
2605 if (pass == XLOG_RECOVER_PASS1) {
2606 return 0;
2607 }
2608 mp = log->l_mp;
2609
2610 /*
2611 * Filesystems are required to send in quota flags at mount time.
2612 */
2613 if (mp->m_qflags == 0)
2614 return (0);
2615
2616 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2617 ASSERT(recddq);
2618 /*
2619 * This type of quota was turned off, so ignore this record.
2620 */
2621 type = INT_GET(recddq->d_flags, ARCH_CONVERT) &
2622 (XFS_DQ_USER | XFS_DQ_GROUP);
2623 ASSERT(type);
2624 if (log->l_quotaoffs_flag & type)
2625 return (0);
2626
2627 /*
2628 * At this point we know that quota was _not_ turned off.
2629 * Since the mount flags do not indicate otherwise, this
2630 * must mean that quota is on, and the dquot needs to be replayed.
2631 * Remember that we may not have fully recovered the superblock yet,
2632 * so we can't do the usual trick of looking at the SB quota bits.
2633 *
2634 * The other possibility, of course, is that the quota subsystem was
2635 * removed since the last mount - ENOSYS.
2636 */
2637 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2638 ASSERT(dq_f);
2639 if ((error = xfs_qm_dqcheck(recddq,
2640 dq_f->qlf_id,
2641 0, XFS_QMOPT_DOWARN,
2642 "xlog_recover_do_dquot_trans (log copy)"))) {
2643 return XFS_ERROR(EIO);
2644 }
2645 ASSERT(dq_f->qlf_len == 1);
2646
2647 error = xfs_read_buf(mp, mp->m_ddev_targp,
2648 dq_f->qlf_blkno,
2649 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2650 0, &bp);
2651 if (error) {
2652 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2653 bp, dq_f->qlf_blkno);
2654 return error;
2655 }
2656 ASSERT(bp);
2657 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2658
2659 /*
2660 * At least the magic num portion should be on disk because this
2661 * was among a chunk of dquots created earlier, and we did some
2662 * minimal initialization then.
2663 */
2664 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2665 "xlog_recover_do_dquot_trans")) {
2666 xfs_buf_relse(bp);
2667 return XFS_ERROR(EIO);
2668 }
2669
2670 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2671
2672 ASSERT(dq_f->qlf_size == 2);
2673 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2674 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2675 XFS_BUF_SET_FSPRIVATE(bp, mp);
2676 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2677 xfs_bdwrite(mp, bp);
2678
2679 return (0);
2680}
2681
2682/*
2683 * This routine is called to create an in-core extent free intent
2684 * item from the efi format structure which was logged on disk.
2685 * It allocates an in-core efi, copies the extents from the format
2686 * structure into it, and adds the efi to the AIL with the given
2687 * LSN.
2688 */
2689STATIC void
2690xlog_recover_do_efi_trans(
2691 xlog_t *log,
2692 xlog_recover_item_t *item,
2693 xfs_lsn_t lsn,
2694 int pass)
2695{
2696 xfs_mount_t *mp;
2697 xfs_efi_log_item_t *efip;
2698 xfs_efi_log_format_t *efi_formatp;
2699 SPLDECL(s);
2700
2701 if (pass == XLOG_RECOVER_PASS1) {
2702 return;
2703 }
2704
2705 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2706 ASSERT(item->ri_buf[0].i_len ==
2707 (sizeof(xfs_efi_log_format_t) +
2708 ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
2709
2710 mp = log->l_mp;
2711 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2712 memcpy((char *)&(efip->efi_format), (char *)efi_formatp,
2713 sizeof(xfs_efi_log_format_t) +
2714 ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
2715 efip->efi_next_extent = efi_formatp->efi_nextents;
2716 efip->efi_flags |= XFS_EFI_COMMITTED;
2717
2718 AIL_LOCK(mp,s);
2719 /*
2720 * xfs_trans_update_ail() drops the AIL lock.
2721 */
2722 xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
2723}
2724
2725
2726/*
2727 * This routine is called when an efd format structure is found in
2728 * a committed transaction in the log. Its purpose is to cancel
2729 * the corresponding efi if it was still in the log. To do this
2730 * it searches the AIL for the efi with an id equal to that in the
2731 * efd format structure. If we find it, we remove the efi from the
2732 * AIL and free it.
2733 */
2734STATIC void
2735xlog_recover_do_efd_trans(
2736 xlog_t *log,
2737 xlog_recover_item_t *item,
2738 int pass)
2739{
2740 xfs_mount_t *mp;
2741 xfs_efd_log_format_t *efd_formatp;
2742 xfs_efi_log_item_t *efip = NULL;
2743 xfs_log_item_t *lip;
2744 int gen;
2745 int nexts;
2746 __uint64_t efi_id;
2747 SPLDECL(s);
2748
2749 if (pass == XLOG_RECOVER_PASS1) {
2750 return;
2751 }
2752
2753 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2754 ASSERT(item->ri_buf[0].i_len ==
2755 (sizeof(xfs_efd_log_format_t) +
2756 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
2757 efi_id = efd_formatp->efd_efi_id;
2758
2759 /*
2760 * Search for the efi with the id in the efd format structure
2761 * in the AIL.
2762 */
2763 mp = log->l_mp;
2764 AIL_LOCK(mp,s);
2765 lip = xfs_trans_first_ail(mp, &gen);
2766 while (lip != NULL) {
2767 if (lip->li_type == XFS_LI_EFI) {
2768 efip = (xfs_efi_log_item_t *)lip;
2769 if (efip->efi_format.efi_id == efi_id) {
2770 /*
2771 * xfs_trans_delete_ail() drops the
2772 * AIL lock.
2773 */
2774 xfs_trans_delete_ail(mp, lip, s);
2775 break;
2776 }
2777 }
2778 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
2779 }
2780 if (lip == NULL) {
2781 AIL_UNLOCK(mp, s);
2782 }
2783
2784 /*
2785 * If we found it, then free it up. If it wasn't there, it
2786 * must have been overwritten in the log. Oh well.
2787 */
2788 if (lip != NULL) {
2789 nexts = efip->efi_format.efi_nextents;
2790 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
2791 kmem_free(lip, sizeof(xfs_efi_log_item_t) +
2792 ((nexts - 1) * sizeof(xfs_extent_t)));
2793 } else {
2794 kmem_zone_free(xfs_efi_zone, efip);
2795 }
2796 }
2797}
2798
2799/*
2800 * Perform the transaction
2801 *
2802 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2803 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2804 */
2805STATIC int
2806xlog_recover_do_trans(
2807 xlog_t *log,
2808 xlog_recover_t *trans,
2809 int pass)
2810{
2811 int error = 0;
2812 xlog_recover_item_t *item, *first_item;
2813
2814 if ((error = xlog_recover_reorder_trans(log, trans)))
2815 return error;
2816 first_item = item = trans->r_itemq;
2817 do {
2818 /*
2819 * we don't need to worry about the block number being
2820 * truncated in > 1 TB buffers because in user-land,
2821 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
2822 * the blknos will get through the user-mode buffer
2823 * cache properly. The only bad case is o32 kernels
2824 * where xfs_daddr_t is 32-bits but mount will warn us
2825 * off a > 1 TB filesystem before we get here.
2826 */
2827 if ((ITEM_TYPE(item) == XFS_LI_BUF) ||
2828 (ITEM_TYPE(item) == XFS_LI_6_1_BUF) ||
2829 (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) {
2830 if ((error = xlog_recover_do_buffer_trans(log, item,
2831 pass)))
2832 break;
2833 } else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
2834 (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
2835 (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
2836 if ((error = xlog_recover_do_inode_trans(log, item,
2837 pass)))
2838 break;
2839 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2840 xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2841 pass);
2842 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2843 xlog_recover_do_efd_trans(log, item, pass);
2844 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2845 if ((error = xlog_recover_do_dquot_trans(log, item,
2846 pass)))
2847 break;
2848 } else if (ITEM_TYPE(item) == XFS_LI_QUOTAOFF) {
2849 if ((error = xlog_recover_do_quotaoff_trans(log, item,
2850 pass)))
2851 break;
2852 } else {
2853 xlog_warn("XFS: xlog_recover_do_trans: unknown item type");
2854 ASSERT(0);
2855 error = XFS_ERROR(EIO);
2856 break;
2857 }
2858 item = item->ri_next;
2859 } while (first_item != item);
2860
2861 return error;
2862}
2863
2864/*
2865 * Free up any resources allocated by the transaction
2866 *
2867 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2868 */
2869STATIC void
2870xlog_recover_free_trans(
2871 xlog_recover_t *trans)
2872{
2873 xlog_recover_item_t *first_item, *item, *free_item;
2874 int i;
2875
2876 item = first_item = trans->r_itemq;
2877 do {
2878 free_item = item;
2879 item = item->ri_next;
2880 /* Free the regions in the item. */
2881 for (i = 0; i < free_item->ri_cnt; i++) {
2882 kmem_free(free_item->ri_buf[i].i_addr,
2883 free_item->ri_buf[i].i_len);
2884 }
2885 /* Free the item itself */
2886 kmem_free(free_item->ri_buf,
2887 (free_item->ri_total * sizeof(xfs_log_iovec_t)));
2888 kmem_free(free_item, sizeof(xlog_recover_item_t));
2889 } while (first_item != item);
2890 /* Free the transaction recover structure */
2891 kmem_free(trans, sizeof(xlog_recover_t));
2892}
2893
2894STATIC int
2895xlog_recover_commit_trans(
2896 xlog_t *log,
2897 xlog_recover_t **q,
2898 xlog_recover_t *trans,
2899 int pass)
2900{
2901 int error;
2902
2903 if ((error = xlog_recover_unlink_tid(q, trans)))
2904 return error;
2905 if ((error = xlog_recover_do_trans(log, trans, pass)))
2906 return error;
2907 xlog_recover_free_trans(trans); /* no error */
2908 return 0;
2909}
2910
2911STATIC int
2912xlog_recover_unmount_trans(
2913 xlog_recover_t *trans)
2914{
2915 /* Do nothing now */
2916 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2917 return 0;
2918}
2919
2920/*
2921 * There are two valid states of the r_state field. 0 indicates that the
2922 * transaction structure is in a normal state. We have either seen the
2923 * start of the transaction or the last operation we added was not a partial
2924 * operation. If the last operation we added to the transaction was a
2925 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2926 *
2927 * NOTE: skip LRs with 0 data length.
2928 */
2929STATIC int
2930xlog_recover_process_data(
2931 xlog_t *log,
2932 xlog_recover_t *rhash[],
2933 xlog_rec_header_t *rhead,
2934 xfs_caddr_t dp,
2935 int pass)
2936{
2937 xfs_caddr_t lp;
2938 int num_logops;
2939 xlog_op_header_t *ohead;
2940 xlog_recover_t *trans;
2941 xlog_tid_t tid;
2942 int error;
2943 unsigned long hash;
2944 uint flags;
2945
2946 lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT);
2947 num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
2948
2949 /* check the log format matches our own - else we can't recover */
2950 if (xlog_header_check_recover(log->l_mp, rhead))
2951 return (XFS_ERROR(EIO));
2952
2953 while ((dp < lp) && num_logops) {
2954 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2955 ohead = (xlog_op_header_t *)dp;
2956 dp += sizeof(xlog_op_header_t);
2957 if (ohead->oh_clientid != XFS_TRANSACTION &&
2958 ohead->oh_clientid != XFS_LOG) {
2959 xlog_warn(
2960 "XFS: xlog_recover_process_data: bad clientid");
2961 ASSERT(0);
2962 return (XFS_ERROR(EIO));
2963 }
2964 tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
2965 hash = XLOG_RHASH(tid);
2966 trans = xlog_recover_find_tid(rhash[hash], tid);
2967 if (trans == NULL) { /* not found; add new tid */
2968 if (ohead->oh_flags & XLOG_START_TRANS)
2969 xlog_recover_new_tid(&rhash[hash], tid,
2970 INT_GET(rhead->h_lsn, ARCH_CONVERT));
2971 } else {
2972 ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
2973 flags = ohead->oh_flags & ~XLOG_END_TRANS;
2974 if (flags & XLOG_WAS_CONT_TRANS)
2975 flags &= ~XLOG_CONTINUE_TRANS;
2976 switch (flags) {
2977 case XLOG_COMMIT_TRANS:
2978 error = xlog_recover_commit_trans(log,
2979 &rhash[hash], trans, pass);
2980 break;
2981 case XLOG_UNMOUNT_TRANS:
2982 error = xlog_recover_unmount_trans(trans);
2983 break;
2984 case XLOG_WAS_CONT_TRANS:
2985 error = xlog_recover_add_to_cont_trans(trans,
2986 dp, INT_GET(ohead->oh_len,
2987 ARCH_CONVERT));
2988 break;
2989 case XLOG_START_TRANS:
2990 xlog_warn(
2991 "XFS: xlog_recover_process_data: bad transaction");
2992 ASSERT(0);
2993 error = XFS_ERROR(EIO);
2994 break;
2995 case 0:
2996 case XLOG_CONTINUE_TRANS:
2997 error = xlog_recover_add_to_trans(trans,
2998 dp, INT_GET(ohead->oh_len,
2999 ARCH_CONVERT));
3000 break;
3001 default:
3002 xlog_warn(
3003 "XFS: xlog_recover_process_data: bad flag");
3004 ASSERT(0);
3005 error = XFS_ERROR(EIO);
3006 break;
3007 }
3008 if (error)
3009 return error;
3010 }
3011 dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
3012 num_logops--;
3013 }
3014 return 0;
3015}
3016
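/*
 * Illustrative sketch (hypothetical helper, not from the XFS source): the
 * flag juggling above isolated into one place. XLOG_END_TRANS is always
 * masked off, and a WAS_CONT marker suppresses CONTINUE so a continuation
 * body is dispatched to the add_to_cont path exactly once.
 */
static unsigned int
normalize_op_flags(unsigned int oh_flags)
{
	unsigned int flags = oh_flags & ~XLOG_END_TRANS;

	if (flags & XLOG_WAS_CONT_TRANS)
		flags &= ~XLOG_CONTINUE_TRANS;
	return flags;
}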
3017/*
3018 * Process an extent free intent item that was recovered from
3019 * the log. We need to free the extents that it describes.
3020 */
3021STATIC void
3022xlog_recover_process_efi(
3023 xfs_mount_t *mp,
3024 xfs_efi_log_item_t *efip)
3025{
3026 xfs_efd_log_item_t *efdp;
3027 xfs_trans_t *tp;
3028 int i;
3029 xfs_extent_t *extp;
3030 xfs_fsblock_t startblock_fsb;
3031
3032 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
3033
3034 /*
3035 * First check the validity of the extents described by the
3036 * EFI. If any are bad, then assume that all are bad and
3037 * just toss the EFI.
3038 */
3039 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3040 extp = &(efip->efi_format.efi_extents[i]);
3041 startblock_fsb = XFS_BB_TO_FSB(mp,
3042 XFS_FSB_TO_DADDR(mp, extp->ext_start));
3043 if ((startblock_fsb == 0) ||
3044 (extp->ext_len == 0) ||
3045 (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3046 (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3047 /*
3048 * This will pull the EFI from the AIL and
3049 * free the memory associated with it.
3050 */
3051 xfs_efi_release(efip, efip->efi_format.efi_nextents);
3052 return;
3053 }
3054 }
3055
3056 tp = xfs_trans_alloc(mp, 0);
3057 xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3058 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3059
3060 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3061 extp = &(efip->efi_format.efi_extents[i]);
3062 xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3063 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3064 extp->ext_len);
3065 }
3066
3067 efip->efi_flags |= XFS_EFI_RECOVERED;
3068 xfs_trans_commit(tp, 0, NULL);
3069}
3070
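/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the EFI
 * extent sanity test above as a predicate. An extent is rejected when it
 * starts at block zero, has zero length, starts past the end of the data
 * device, or is at least as long as an allocation group. The fsb start is
 * taken as already computed by the round-trip conversion above.
 */
static int
efi_extent_is_bad(unsigned long long start_fsb, unsigned int len,
		  unsigned long long dblocks, unsigned int agblocks)
{
	return start_fsb == 0 || len == 0 ||
	       start_fsb >= dblocks || len >= agblocks;
}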
3071/*
3072 * Verify that once we've encountered something other than an EFI
3073 * in the AIL, there are no more EFIs in the AIL.
3074 */
3075#if defined(DEBUG)
3076STATIC void
3077xlog_recover_check_ail(
3078 xfs_mount_t *mp,
3079 xfs_log_item_t *lip,
3080 int gen)
3081{
3082 int orig_gen = gen;
3083
3084 do {
3085 ASSERT(lip->li_type != XFS_LI_EFI);
3086 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3087 /*
3088 * The check will be bogus if we restart from the
3089 * beginning of the AIL, so ASSERT that we don't.
3090 * We never should since we're holding the AIL lock
3091 * the entire time.
3092 */
3093 ASSERT(gen == orig_gen);
3094 } while (lip != NULL);
3095}
3096#endif /* DEBUG */
3097
3098/*
3099 * When this is called, all of the EFIs which did not have
3100 * corresponding EFDs should be in the AIL. What we do now
3101 * is free the extents associated with each one.
3102 *
3103 * Since we process the EFIs in normal transactions, they
3104 * will be removed at some point after the commit. This prevents
3105 * us from just walking down the list processing each one.
3106 * We'll use a flag in the EFI to skip those that we've already
3107 * processed and use the AIL iteration mechanism's generation
3108 * count to try to speed this up at least a bit.
3109 *
3110 * When we start, we know that the EFIs are the only things in
3111 * the AIL. As we process them, however, other items are added
3112 * to the AIL. Since everything added to the AIL must come after
3113 * everything already in the AIL, we stop processing as soon as
3114 * we see something other than an EFI in the AIL.
3115 */
3116STATIC void
3117xlog_recover_process_efis(
3118 xlog_t *log)
3119{
3120 xfs_log_item_t *lip;
3121 xfs_efi_log_item_t *efip;
3122 int gen;
3123 xfs_mount_t *mp;
3124 SPLDECL(s);
3125
3126 mp = log->l_mp;
3127 AIL_LOCK(mp,s);
3128
3129 lip = xfs_trans_first_ail(mp, &gen);
3130 while (lip != NULL) {
3131 /*
3132 * We're done when we see something other than an EFI.
3133 */
3134 if (lip->li_type != XFS_LI_EFI) {
3135 xlog_recover_check_ail(mp, lip, gen);
3136 break;
3137 }
3138
3139 /*
3140 * Skip EFIs that we've already processed.
3141 */
3142 efip = (xfs_efi_log_item_t *)lip;
3143 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3144 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3145 continue;
3146 }
3147
3148 AIL_UNLOCK(mp, s);
3149 xlog_recover_process_efi(mp, efip);
3150 AIL_LOCK(mp,s);
3151 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3152 }
3153 AIL_UNLOCK(mp, s);
3154}
3155
3156/*
3157 * This routine performs a transaction to null out a bad inode pointer
3158 * in an agi unlinked inode hash bucket.
3159 */
3160STATIC void
3161xlog_recover_clear_agi_bucket(
3162 xfs_mount_t *mp,
3163 xfs_agnumber_t agno,
3164 int bucket)
3165{
3166 xfs_trans_t *tp;
3167 xfs_agi_t *agi;
3168 xfs_buf_t *agibp;
3169 int offset;
3170 int error;
3171
3172 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3173 xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
3174
3175 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3176 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3177 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3178 if (error) {
3179 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3180 return;
3181 }
3182
3183 agi = XFS_BUF_TO_AGI(agibp);
3184 if (INT_GET(agi->agi_magicnum, ARCH_CONVERT) != XFS_AGI_MAGIC) {
3185 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3186 return;
3187 }
3188 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
3189
3190 INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, NULLAGINO);
3191 offset = offsetof(xfs_agi_t, agi_unlinked) +
3192 (sizeof(xfs_agino_t) * bucket);
3193 xfs_trans_log_buf(tp, agibp, offset,
3194 (offset + sizeof(xfs_agino_t) - 1));
3195
3196 (void) xfs_trans_commit(tp, 0, NULL);
3197}
3198
3199/*
3200 * xlog_recover_process_iunlinks
3201 *
3202 * This is called during recovery to process any inodes which
3203 * we unlinked but had not freed when the system crashed. These
3204 * inodes will be on the lists in the AGI blocks. What we do
3205 * here is scan all the AGIs and fully truncate and free any
3206 * inodes found on the lists. Each inode is removed from the
3207 * lists when it has been fully truncated and is freed. The
3208 * freeing of the inode and its removal from the list must be
3209 * atomic.
3210 */
3211void
3212xlog_recover_process_iunlinks(
3213 xlog_t *log)
3214{
3215 xfs_mount_t *mp;
3216 xfs_agnumber_t agno;
3217 xfs_agi_t *agi;
3218 xfs_buf_t *agibp;
3219 xfs_buf_t *ibp;
3220 xfs_dinode_t *dip;
3221 xfs_inode_t *ip;
3222 xfs_agino_t agino;
3223 xfs_ino_t ino;
3224 int bucket;
3225 int error;
3226 uint mp_dmevmask;
3227
3228 mp = log->l_mp;
3229
3230 /*
3231 * Prevent any DMAPI event from being sent while in this function.
3232 */
3233 mp_dmevmask = mp->m_dmevmask;
3234 mp->m_dmevmask = 0;
3235
3236 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3237 /*
3238 * Find the agi for this ag.
3239 */
3240 agibp = xfs_buf_read(mp->m_ddev_targp,
3241 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3242 XFS_FSS_TO_BB(mp, 1), 0);
3243 if (XFS_BUF_ISERROR(agibp)) {
3244 xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)",
3245 log->l_mp, agibp,
3246 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)));
3247 }
3248 agi = XFS_BUF_TO_AGI(agibp);
3249 ASSERT(XFS_AGI_MAGIC ==
3250 INT_GET(agi->agi_magicnum, ARCH_CONVERT));
3251
3252 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3253
3254 agino = INT_GET(agi->agi_unlinked[bucket], ARCH_CONVERT);
3255 while (agino != NULLAGINO) {
3256
3257 /*
3258 * Release the agi buffer so that it can
3259 * be acquired in the normal course of the
3260 * transaction to truncate and free the inode.
3261 */
3262 xfs_buf_relse(agibp);
3263
3264 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3265 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3266 ASSERT(error || (ip != NULL));
3267
3268 if (!error) {
3269 /*
3270 * Get the on disk inode to find the
3271 * next inode in the bucket.
3272 */
3273 error = xfs_itobp(mp, NULL, ip, &dip,
3274 &ibp, 0);
3275 ASSERT(error || (dip != NULL));
3276 }
3277
3278 if (!error) {
3279 ASSERT(ip->i_d.di_nlink == 0);
3280
3281 /* setup for the next pass */
3282 agino = INT_GET(dip->di_next_unlinked,
3283 ARCH_CONVERT);
3284 xfs_buf_relse(ibp);
3285 /*
3286 * Prevent any DMAPI event from
3287 * being sent when the
3288 * reference on the inode is
3289 * dropped.
3290 */
3291 ip->i_d.di_dmevmask = 0;
3292
3293 /*
3294 * If this is a new inode, handle
3295 * it specially. Otherwise,
3296 * just drop our reference to the
3297 * inode. If there are no
3298 * other references, this will
3299 * send the inode to
3300 * xfs_inactive() which will
3301 * truncate the file and free
3302 * the inode.
3303 */
3304 if (ip->i_d.di_mode == 0)
3305 xfs_iput_new(ip, 0);
3306 else
3307 VN_RELE(XFS_ITOV(ip));
3308 } else {
3309 /*
3310 * We can't read in the inode
3311 * this bucket points to, or
3312 * this inode is messed up. Just
3313 * ditch this bucket of inodes. We
3314 * will lose some inodes and space,
3315 * but at least we won't hang. Call
3316 * xlog_recover_clear_agi_bucket()
3317 * to perform a transaction to clear
3318 * the inode pointer in the bucket.
3319 */
3320 xlog_recover_clear_agi_bucket(mp, agno,
3321 bucket);
3322
3323 agino = NULLAGINO;
3324 }
3325
3326 /*
 3327 * Reacquire the AGI buffer and continue around
3328 * the loop.
3329 */
3330 agibp = xfs_buf_read(mp->m_ddev_targp,
3331 XFS_AG_DADDR(mp, agno,
3332 XFS_AGI_DADDR(mp)),
3333 XFS_FSS_TO_BB(mp, 1), 0);
3334 if (XFS_BUF_ISERROR(agibp)) {
3335 xfs_ioerror_alert(
3336 "xlog_recover_process_iunlinks(#2)",
3337 log->l_mp, agibp,
3338 XFS_AG_DADDR(mp, agno,
3339 XFS_AGI_DADDR(mp)));
3340 }
3341 agi = XFS_BUF_TO_AGI(agibp);
3342 ASSERT(XFS_AGI_MAGIC == INT_GET(
3343 agi->agi_magicnum, ARCH_CONVERT));
3344 }
3345 }
3346
3347 /*
3348 * Release the buffer for the current agi so we can
3349 * go on to the next one.
3350 */
3351 xfs_buf_relse(agibp);
3352 }
3353
3354 mp->m_dmevmask = mp_dmevmask;
3355}
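/*
 * Editor's sketch: stripped of the buffer juggling and DMAPI masking,
 * each bucket is a singly linked list threaded through the on-disk
 * di_next_unlinked fields.  A minimal model (hypothetical names; the
 * lookup callback stands in for the xfs_iget() plus xfs_itobp() steps
 * above):
 */
#include <stdint.h>

#define EXAMPLE_NULLAGINO	((uint32_t)-1)

struct example_dinode {
	uint32_t	next_unlinked;	/* agino of next chain entry */
};

static void
example_walk_bucket(uint32_t head,
		    struct example_dinode *(*lookup)(uint32_t agino))
{
	uint32_t	agino = head;

	while (agino != EXAMPLE_NULLAGINO) {
		struct example_dinode	*dip = lookup(agino);

		if (dip == NULL)
			break;	/* bad chain: real code clears the bucket */
		/* the truncate/free of the inode would happen here */
		agino = dip->next_unlinked;	/* set up for the next pass */
	}
}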
3356
3357
3358#ifdef DEBUG
3359STATIC void
3360xlog_pack_data_checksum(
3361 xlog_t *log,
3362 xlog_in_core_t *iclog,
3363 int size)
3364{
3365 int i;
3366 uint *up;
3367 uint chksum = 0;
3368
3369 up = (uint *)iclog->ic_datap;
3370 /* divide length by 4 to get # words */
3371 for (i = 0; i < (size >> 2); i++) {
3372 chksum ^= INT_GET(*up, ARCH_CONVERT);
3373 up++;
3374 }
3375 INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum);
3376}
3377#else
3378#define xlog_pack_data_checksum(log, iclog, size)
3379#endif
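/*
 * Editor's sketch: the checksum above is a plain XOR over 32-bit words.
 * A self-contained version for illustration (the INT_GET endian
 * conversion is omitted here, so this matches the kernel code only on a
 * host whose byte order matches the log):
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t
example_xor_checksum(const uint32_t *words, size_t len_bytes)
{
	uint32_t	chksum = 0;
	size_t		i;

	/* divide length by 4 to get # words, exactly as above */
	for (i = 0; i < (len_bytes >> 2); i++)
		chksum ^= words[i];
	return chksum;
}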
3380
3381/*
3382 * Stamp cycle number in every block
3383 */
3384void
3385xlog_pack_data(
3386 xlog_t *log,
3387 xlog_in_core_t *iclog,
3388 int roundoff)
3389{
3390 int i, j, k;
3391 int size = iclog->ic_offset + roundoff;
3392 uint cycle_lsn;
3393 xfs_caddr_t dp;
3394 xlog_in_core_2_t *xhdr;
3395
3396 xlog_pack_data_checksum(log, iclog, size);
3397
3398 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3399
3400 dp = iclog->ic_datap;
3401 for (i = 0; i < BTOBB(size) &&
3402 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3403 iclog->ic_header.h_cycle_data[i] = *(uint *)dp;
3404 *(uint *)dp = cycle_lsn;
3405 dp += BBSIZE;
3406 }
3407
3408 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3409 xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
3410 for ( ; i < BTOBB(size); i++) {
3411 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3412 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3413 xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp;
3414 *(uint *)dp = cycle_lsn;
3415 dp += BBSIZE;
3416 }
3417
3418 for (i = 1; i < log->l_iclog_heads; i++) {
3419 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3420 }
3421 }
3422}
3423
3424#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3425STATIC void
3426xlog_unpack_data_checksum(
3427 xlog_rec_header_t *rhead,
3428 xfs_caddr_t dp,
3429 xlog_t *log)
3430{
3431 uint *up = (uint *)dp;
3432 uint chksum = 0;
3433 int i;
3434
3435 /* divide length by 4 to get # words */
3436 for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
3437 chksum ^= INT_GET(*up, ARCH_CONVERT);
3438 up++;
3439 }
3440 if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
3441 if (rhead->h_chksum ||
3442 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3443 cmn_err(CE_DEBUG,
3444 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
3445 INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
3446 cmn_err(CE_DEBUG,
3447"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3448 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3449 cmn_err(CE_DEBUG,
3450 "XFS: LogR this is a LogV2 filesystem");
3451 }
3452 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3453 }
3454 }
3455}
3456#else
3457#define xlog_unpack_data_checksum(rhead, dp, log)
3458#endif
3459
3460STATIC void
3461xlog_unpack_data(
3462 xlog_rec_header_t *rhead,
3463 xfs_caddr_t dp,
3464 xlog_t *log)
3465{
3466 int i, j, k;
3467 xlog_in_core_2_t *xhdr;
3468
3469 for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) &&
3470 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3471 *(uint *)dp = *(uint *)&rhead->h_cycle_data[i];
3472 dp += BBSIZE;
3473 }
3474
3475 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3476 xhdr = (xlog_in_core_2_t *)rhead;
3477 for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
3478 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3479 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3480 *(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3481 dp += BBSIZE;
3482 }
3483 }
3484
3485 xlog_unpack_data_checksum(rhead, dp, log);
3486}
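/*
 * Editor's sketch: pack and unpack are exact inverses.  Stamping the
 * cycle number into the first word of every 512-byte basic block lets
 * recovery detect a torn write: a block whose first word carries the
 * wrong cycle was never completely written.  A hedged round-trip model
 * that drops the LogV2 multi-header split and endian handling
 * (hypothetical names):
 */
#include <stdint.h>
#include <string.h>

#define EXAMPLE_BBSIZE	512

static void
example_pack(char *buf, int nblocks, uint32_t cycle, uint32_t *saved)
{
	int	i;

	for (i = 0; i < nblocks; i++) {
		memcpy(&saved[i], buf + i * EXAMPLE_BBSIZE, sizeof(saved[i]));
		memcpy(buf + i * EXAMPLE_BBSIZE, &cycle, sizeof(cycle));
	}
}

static void
example_unpack(char *buf, int nblocks, const uint32_t *saved)
{
	int	i;

	for (i = 0; i < nblocks; i++)
		memcpy(buf + i * EXAMPLE_BBSIZE, &saved[i], sizeof(saved[i]));
}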
3487
3488STATIC int
3489xlog_valid_rec_header(
3490 xlog_t *log,
3491 xlog_rec_header_t *rhead,
3492 xfs_daddr_t blkno)
3493{
3494 int hlen;
3495
3496 if (unlikely(
3497 (INT_GET(rhead->h_magicno, ARCH_CONVERT) !=
3498 XLOG_HEADER_MAGIC_NUM))) {
3499 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3500 XFS_ERRLEVEL_LOW, log->l_mp);
3501 return XFS_ERROR(EFSCORRUPTED);
3502 }
3503 if (unlikely(
3504 (!rhead->h_version ||
3505 (INT_GET(rhead->h_version, ARCH_CONVERT) &
3506 (~XLOG_VERSION_OKBITS)) != 0))) {
3507 xlog_warn("XFS: %s: unrecognised log version (%d).",
3508 __FUNCTION__, INT_GET(rhead->h_version, ARCH_CONVERT));
3509 return XFS_ERROR(EIO);
3510 }
3511
3512 /* LR body must have data or it wouldn't have been written */
3513 hlen = INT_GET(rhead->h_len, ARCH_CONVERT);
3514 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3515 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3516 XFS_ERRLEVEL_LOW, log->l_mp);
3517 return XFS_ERROR(EFSCORRUPTED);
3518 }
3519 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3520 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3521 XFS_ERRLEVEL_LOW, log->l_mp);
3522 return XFS_ERROR(EFSCORRUPTED);
3523 }
3524 return 0;
3525}
3526
3527/*
3528 * Read the log from tail to head and process the log records found.
3529 * Handle the two cases where the tail and head are in the same cycle
3530 * and where the active portion of the log wraps around the end of
3531 * the physical log separately. The pass parameter is passed through
3532 * to the routines called to process the data and is not looked at
3533 * here.
3534 */
3535STATIC int
3536xlog_do_recovery_pass(
3537 xlog_t *log,
3538 xfs_daddr_t head_blk,
3539 xfs_daddr_t tail_blk,
3540 int pass)
3541{
3542 xlog_rec_header_t *rhead;
3543 xfs_daddr_t blk_no;
3544 xfs_caddr_t bufaddr, offset;
3545 xfs_buf_t *hbp, *dbp;
3546 int error = 0, h_size;
3547 int bblks, split_bblks;
3548 int hblks, split_hblks, wrapped_hblks;
3549 xlog_recover_t *rhash[XLOG_RHASH_SIZE];
3550
3551 ASSERT(head_blk != tail_blk);
3552
3553 /*
3554 * Read the header of the tail block and get the iclog buffer size from
3555 * h_size. Use this to tell how many sectors make up the log header.
3556 */
3557 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3558 /*
3559 * When using variable length iclogs, read first sector of
3560 * iclog header and extract the header size from it. Get a
3561 * new hbp that is the correct size.
3562 */
3563 hbp = xlog_get_bp(log, 1);
3564 if (!hbp)
3565 return ENOMEM;
3566 if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3567 goto bread_err1;
3568 offset = xlog_align(log, tail_blk, 1, hbp);
3569 rhead = (xlog_rec_header_t *)offset;
3570 error = xlog_valid_rec_header(log, rhead, tail_blk);
3571 if (error)
3572 goto bread_err1;
3573 h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
3574 if ((INT_GET(rhead->h_version, ARCH_CONVERT)
3575 & XLOG_VERSION_2) &&
3576 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3577 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3578 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3579 hblks++;
3580 xlog_put_bp(hbp);
3581 hbp = xlog_get_bp(log, hblks);
3582 } else {
3583 hblks = 1;
3584 }
3585 } else {
3586 ASSERT(log->l_sectbb_log == 0);
3587 hblks = 1;
3588 hbp = xlog_get_bp(log, 1);
3589 h_size = XLOG_BIG_RECORD_BSIZE;
3590 }
3591
3592 if (!hbp)
3593 return ENOMEM;
3594 dbp = xlog_get_bp(log, BTOBB(h_size));
3595 if (!dbp) {
3596 xlog_put_bp(hbp);
3597 return ENOMEM;
3598 }
3599
3600 memset(rhash, 0, sizeof(rhash));
3601 if (tail_blk <= head_blk) {
3602 for (blk_no = tail_blk; blk_no < head_blk; ) {
3603 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3604 goto bread_err2;
3605 offset = xlog_align(log, blk_no, hblks, hbp);
3606 rhead = (xlog_rec_header_t *)offset;
3607 error = xlog_valid_rec_header(log, rhead, blk_no);
3608 if (error)
3609 goto bread_err2;
3610
3611 /* blocks in data section */
3612 bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3613 error = xlog_bread(log, blk_no + hblks, bblks, dbp);
3614 if (error)
3615 goto bread_err2;
3616 offset = xlog_align(log, blk_no + hblks, bblks, dbp);
3617 xlog_unpack_data(rhead, offset, log);
3618 if ((error = xlog_recover_process_data(log,
3619 rhash, rhead, offset, pass)))
3620 goto bread_err2;
3621 blk_no += bblks + hblks;
3622 }
3623 } else {
3624 /*
3625 * Perform recovery around the end of the physical log.
3626 * When the head is not on the same cycle number as the tail,
3627 * we can't do a sequential recovery as above.
3628 */
3629 blk_no = tail_blk;
3630 while (blk_no < log->l_logBBsize) {
3631 /*
3632 * Check for header wrapping around physical end-of-log
3633 */
3634 offset = NULL;
3635 split_hblks = 0;
3636 wrapped_hblks = 0;
3637 if (blk_no + hblks <= log->l_logBBsize) {
3638 /* Read header in one read */
3639 error = xlog_bread(log, blk_no, hblks, hbp);
3640 if (error)
3641 goto bread_err2;
3642 offset = xlog_align(log, blk_no, hblks, hbp);
3643 } else {
3644 /* This LR is split across physical log end */
3645 if (blk_no != log->l_logBBsize) {
3646 /* some data before physical log end */
3647 ASSERT(blk_no <= INT_MAX);
3648 split_hblks = log->l_logBBsize - (int)blk_no;
3649 ASSERT(split_hblks > 0);
3650 if ((error = xlog_bread(log, blk_no,
3651 split_hblks, hbp)))
3652 goto bread_err2;
3653 offset = xlog_align(log, blk_no,
3654 split_hblks, hbp);
3655 }
3656 /*
3657 * Note: this black magic still works with
3658 * large sector sizes (non-512) only because:
3659 * - we increased the buffer size originally
3660 * by 1 sector giving us enough extra space
3661 * for the second read;
3662 * - the log start is guaranteed to be sector
3663 * aligned;
3664 * - we read the log end (LR header start)
3665 * _first_, then the log start (LR header end)
3666 * - order is important.
3667 */
3668 bufaddr = XFS_BUF_PTR(hbp);
3669 XFS_BUF_SET_PTR(hbp,
3670 bufaddr + BBTOB(split_hblks),
3671 BBTOB(hblks - split_hblks));
3672 wrapped_hblks = hblks - split_hblks;
3673 error = xlog_bread(log, 0, wrapped_hblks, hbp);
3674 if (error)
3675 goto bread_err2;
3676 XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
3677 if (!offset)
3678 offset = xlog_align(log, 0,
3679 wrapped_hblks, hbp);
3680 }
3681 rhead = (xlog_rec_header_t *)offset;
3682 error = xlog_valid_rec_header(log, rhead,
3683 split_hblks ? blk_no : 0);
3684 if (error)
3685 goto bread_err2;
3686
3687 bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3688 blk_no += hblks;
3689
3690 /* Read in data for log record */
3691 if (blk_no + bblks <= log->l_logBBsize) {
3692 error = xlog_bread(log, blk_no, bblks, dbp);
3693 if (error)
3694 goto bread_err2;
3695 offset = xlog_align(log, blk_no, bblks, dbp);
3696 } else {
3697 /* This log record is split across the
3698 * physical end of log */
3699 offset = NULL;
3700 split_bblks = 0;
3701 if (blk_no != log->l_logBBsize) {
3702 /* some data is before the physical
3703 * end of log */
3704 ASSERT(!wrapped_hblks);
3705 ASSERT(blk_no <= INT_MAX);
3706 split_bblks =
3707 log->l_logBBsize - (int)blk_no;
3708 ASSERT(split_bblks > 0);
3709 if ((error = xlog_bread(log, blk_no,
3710 split_bblks, dbp)))
3711 goto bread_err2;
3712 offset = xlog_align(log, blk_no,
3713 split_bblks, dbp);
3714 }
3715 /*
3716 * Note: this black magic still works with
3717 * large sector sizes (non-512) only because:
3718 * - we increased the buffer size originally
3719 * by 1 sector giving us enough extra space
3720 * for the second read;
3721 * - the log start is guaranteed to be sector
3722 * aligned;
3723 * - we read the log end (LR header start)
3724 * _first_, then the log start (LR header end)
3725 * - order is important.
3726 */
3727 bufaddr = XFS_BUF_PTR(dbp);
3728 XFS_BUF_SET_PTR(dbp,
3729 bufaddr + BBTOB(split_bblks),
3730 BBTOB(bblks - split_bblks));
3731 if ((error = xlog_bread(log, wrapped_hblks,
3732 bblks - split_bblks, dbp)))
3733 goto bread_err2;
3734 XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3735 if (!offset)
3736 offset = xlog_align(log, wrapped_hblks,
3737 bblks - split_bblks, dbp);
3738 }
3739 xlog_unpack_data(rhead, offset, log);
3740 if ((error = xlog_recover_process_data(log, rhash,
3741 rhead, offset, pass)))
3742 goto bread_err2;
3743 blk_no += bblks;
3744 }
3745
3746 ASSERT(blk_no >= log->l_logBBsize);
3747 blk_no -= log->l_logBBsize;
3748
3749 /* read first part of physical log */
3750 while (blk_no < head_blk) {
3751 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3752 goto bread_err2;
3753 offset = xlog_align(log, blk_no, hblks, hbp);
3754 rhead = (xlog_rec_header_t *)offset;
3755 error = xlog_valid_rec_header(log, rhead, blk_no);
3756 if (error)
3757 goto bread_err2;
3758 bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3759 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
3760 goto bread_err2;
3761 offset = xlog_align(log, blk_no+hblks, bblks, dbp);
3762 xlog_unpack_data(rhead, offset, log);
3763 if ((error = xlog_recover_process_data(log, rhash,
3764 rhead, offset, pass)))
3765 goto bread_err2;
3766 blk_no += bblks + hblks;
3767 }
3768 }
3769
3770 bread_err2:
3771 xlog_put_bp(dbp);
3772 bread_err1:
3773 xlog_put_bp(hbp);
3774 return error;
3775}
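/*
 * Editor's sketch: the "black magic" wrap-around reads above reduce to
 * one rule: if a run of blocks crosses the physical end of the log,
 * read the tail-end piece first, then the piece that wrapped to block
 * zero.  A hedged standalone model (read_blocks() is a hypothetical
 * stand-in for the xlog_bread() plus xlog_align() pair):
 */
static int
example_circular_read(char *dst, int blkno, int nblocks, int logsize,
		      int (*read_blocks)(char *dst, int blkno, int nblocks))
{
	int	split, error;

	if (blkno + nblocks <= logsize)		/* no wrap: one read */
		return read_blocks(dst, blkno, nblocks);

	split = logsize - blkno;		/* blocks before the wrap */
	if (split > 0 && (error = read_blocks(dst, blkno, split)))
		return error;
	return read_blocks(dst + split * 512 /* BBSIZE */, 0,
			   nblocks - split);
}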
3776
3777/*
3778 * Do the recovery of the log. We actually do this in two phases.
 3779 * The two passes are necessary to implement the cancellation of
 3780 * records written into the log. The first pass
3781 * determines those things which have been cancelled, and the
3782 * second pass replays log items normally except for those which
3783 * have been cancelled. The handling of the replay and cancellations
3784 * takes place in the log item type specific routines.
3785 *
3786 * The table of items which have cancel records in the log is allocated
3787 * and freed at this level, since only here do we know when all of
3788 * the log recovery has been completed.
3789 */
3790STATIC int
3791xlog_do_log_recovery(
3792 xlog_t *log,
3793 xfs_daddr_t head_blk,
3794 xfs_daddr_t tail_blk)
3795{
3796 int error;
3797
3798 ASSERT(head_blk != tail_blk);
3799
3800 /*
3801 * First do a pass to find all of the cancelled buf log items.
3802 * Store them in the buf_cancel_table for use in the second pass.
3803 */
3804 log->l_buf_cancel_table =
3805 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3806 sizeof(xfs_buf_cancel_t*),
3807 KM_SLEEP);
3808 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3809 XLOG_RECOVER_PASS1);
3810 if (error != 0) {
3811 kmem_free(log->l_buf_cancel_table,
3812 XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
3813 log->l_buf_cancel_table = NULL;
3814 return error;
3815 }
3816 /*
3817 * Then do a second pass to actually recover the items in the log.
3818 * When it is complete free the table of buf cancel items.
3819 */
3820 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3821 XLOG_RECOVER_PASS2);
3822#ifdef DEBUG
3823 {
3824 int i;
3825
3826 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3827 ASSERT(log->l_buf_cancel_table[i] == NULL);
3828 }
3829#endif /* DEBUG */
3830
3831 kmem_free(log->l_buf_cancel_table,
3832 XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
3833 log->l_buf_cancel_table = NULL;
3834
3835 return error;
3836}
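/*
 * Editor's sketch: the cancel table is a small chained hash keyed by
 * buffer location.  Pass 1 populates it; pass 2 consults it before
 * replaying a buffer.  A minimal standalone model (hypothetical names;
 * the real entries also carry a length and a reference count):
 */
#include <stdint.h>
#include <stdlib.h>

#define EXAMPLE_BC_TABLE_SIZE	64

struct example_cancel {
	uint64_t		blkno;	/* buffer start block */
	struct example_cancel	*next;
};

/* pass 1: remember every buffer that has a cancel record */
static void
example_note_cancel(struct example_cancel **table, uint64_t blkno)
{
	int			h = (int)(blkno % EXAMPLE_BC_TABLE_SIZE);
	struct example_cancel	*bcp = malloc(sizeof(*bcp));

	if (bcp == NULL)
		return;		/* sketch only; the kernel allocs KM_SLEEP */
	bcp->blkno = blkno;
	bcp->next = table[h];
	table[h] = bcp;
}

/* pass 2: skip replay of anything recorded in pass 1 */
static int
example_is_cancelled(struct example_cancel **table, uint64_t blkno)
{
	struct example_cancel	*bcp;

	for (bcp = table[blkno % EXAMPLE_BC_TABLE_SIZE]; bcp; bcp = bcp->next)
		if (bcp->blkno == blkno)
			return 1;
	return 0;
}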
3837
3838/*
3839 * Do the actual recovery
3840 */
3841STATIC int
3842xlog_do_recover(
3843 xlog_t *log,
3844 xfs_daddr_t head_blk,
3845 xfs_daddr_t tail_blk)
3846{
3847 int error;
3848 xfs_buf_t *bp;
3849 xfs_sb_t *sbp;
3850
3851 /*
3852 * First replay the images in the log.
3853 */
3854 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3855 if (error) {
3856 return error;
3857 }
3858
3859 XFS_bflush(log->l_mp->m_ddev_targp);
3860
3861 /*
3862 * If IO errors happened during recovery, bail out.
3863 */
3864 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3865 return (EIO);
3866 }
3867
3868 /*
3869 * We now update the tail_lsn since much of the recovery has completed
 3870 * and there may be space available to use. If there were no extent
 3871 * frees or iunlinks, we can free up the entire log and set the tail_lsn to
3872 * be the last_sync_lsn. This was set in xlog_find_tail to be the
3873 * lsn of the last known good LR on disk. If there are extent frees
3874 * or iunlinks they will have some entries in the AIL; so we look at
3875 * the AIL to determine how to set the tail_lsn.
3876 */
3877 xlog_assign_tail_lsn(log->l_mp);
3878
3879 /*
3880 * Now that we've finished replaying all buffer and inode
3881 * updates, re-read in the superblock.
3882 */
3883 bp = xfs_getsb(log->l_mp, 0);
3884 XFS_BUF_UNDONE(bp);
3885 XFS_BUF_READ(bp);
3886 xfsbdstrat(log->l_mp, bp);
3887 if ((error = xfs_iowait(bp))) {
3888 xfs_ioerror_alert("xlog_do_recover",
3889 log->l_mp, bp, XFS_BUF_ADDR(bp));
3890 ASSERT(0);
3891 xfs_buf_relse(bp);
3892 return error;
3893 }
3894
3895 /* Convert superblock from on-disk format */
3896 sbp = &log->l_mp->m_sb;
3897 xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, XFS_SB_ALL_BITS);
3898 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3899 ASSERT(XFS_SB_GOOD_VERSION(sbp));
3900 xfs_buf_relse(bp);
3901
3902 xlog_recover_check_summary(log);
3903
3904 /* Normal transactions can now occur */
3905 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3906 return 0;
3907}
3908
3909/*
3910 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3911 *
3912 * Return error or zero.
3913 */
3914int
3915xlog_recover(
3916 xlog_t *log,
3917 int readonly)
3918{
3919 xfs_daddr_t head_blk, tail_blk;
3920 int error;
3921
3922 /* find the tail of the log */
3923 if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly)))
3924 return error;
3925
3926 if (tail_blk != head_blk) {
3927 /* There used to be a comment here:
3928 *
3929 * disallow recovery on read-only mounts. note -- mount
3930 * checks for ENOSPC and turns it into an intelligent
3931 * error message.
3932 * ...but this is no longer true. Now, unless you specify
3933 * NORECOVERY (in which case this function would never be
3934 * called), we just go ahead and recover. We do this all
3935 * under the vfs layer, so we can get away with it unless
3936 * the device itself is read-only, in which case we fail.
3937 */
3938 if ((error = xfs_dev_is_read_only(log->l_mp,
3939 "recovery required"))) {
3940 return error;
3941 }
3942
3943 cmn_err(CE_NOTE,
3944 "Starting XFS recovery on filesystem: %s (dev: %s)",
3945 log->l_mp->m_fsname, XFS_BUFTARG_NAME(log->l_targ));
3946
3947 error = xlog_do_recover(log, head_blk, tail_blk);
3948 log->l_flags |= XLOG_RECOVERY_NEEDED;
3949 }
3950 return error;
3951}
3952
3953/*
3954 * In the first part of recovery we replay inodes and buffers and build
3955 * up the list of extent free items which need to be processed. Here
3956 * we process the extent free items and clean up the on disk unlinked
3957 * inode lists. This is separated from the first part of recovery so
3958 * that the root and real-time bitmap inodes can be read in from disk in
3959 * between the two stages. This is necessary so that we can free space
3960 * in the real-time portion of the file system.
3961 */
3962int
3963xlog_recover_finish(
3964 xlog_t *log,
3965 int mfsi_flags)
3966{
3967 /*
3968 * Now we're ready to do the transactions needed for the
3969 * rest of recovery. Start with completing all the extent
3970 * free intent records and then process the unlinked inode
3971 * lists. At this point, we essentially run in normal mode
3972 * except that we're still performing recovery actions
3973 * rather than accepting new requests.
3974 */
3975 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3976 xlog_recover_process_efis(log);
3977 /*
3978 * Sync the log to get all the EFIs out of the AIL.
3979 * This isn't absolutely necessary, but it helps in
3980 * case the unlink transactions would have problems
3981 * pushing the EFIs out of the way.
3982 */
3983 xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3984 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3985
 3986		if ((mfsi_flags & XFS_MFSI_NOUNLINK) == 0) {
3987 xlog_recover_process_iunlinks(log);
3988 }
3989
3990 xlog_recover_check_summary(log);
3991
3992 cmn_err(CE_NOTE,
3993 "Ending XFS recovery on filesystem: %s (dev: %s)",
3994 log->l_mp->m_fsname, XFS_BUFTARG_NAME(log->l_targ));
3995 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3996 } else {
3997 cmn_err(CE_DEBUG,
3998 "!Ending clean XFS mount for filesystem: %s",
3999 log->l_mp->m_fsname);
4000 }
4001 return 0;
4002}
4003
4004
4005#if defined(DEBUG)
4006/*
4007 * Read all of the agf and agi counters and check that they
4008 * are consistent with the superblock counters.
4009 */
4010void
4011xlog_recover_check_summary(
4012 xlog_t *log)
4013{
4014 xfs_mount_t *mp;
4015 xfs_agf_t *agfp;
4016 xfs_agi_t *agip;
4017 xfs_buf_t *agfbp;
4018 xfs_buf_t *agibp;
4019 xfs_daddr_t agfdaddr;
4020 xfs_daddr_t agidaddr;
4021 xfs_buf_t *sbbp;
4022#ifdef XFS_LOUD_RECOVERY
4023 xfs_sb_t *sbp;
4024#endif
4025 xfs_agnumber_t agno;
4026 __uint64_t freeblks;
4027 __uint64_t itotal;
4028 __uint64_t ifree;
4029
4030 mp = log->l_mp;
4031
4032 freeblks = 0LL;
4033 itotal = 0LL;
4034 ifree = 0LL;
4035 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4036 agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
4037 agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr,
4038 XFS_FSS_TO_BB(mp, 1), 0);
4039 if (XFS_BUF_ISERROR(agfbp)) {
4040 xfs_ioerror_alert("xlog_recover_check_summary(agf)",
4041 mp, agfbp, agfdaddr);
4042 }
4043 agfp = XFS_BUF_TO_AGF(agfbp);
4044 ASSERT(XFS_AGF_MAGIC ==
4045 INT_GET(agfp->agf_magicnum, ARCH_CONVERT));
4046 ASSERT(XFS_AGF_GOOD_VERSION(
4047 INT_GET(agfp->agf_versionnum, ARCH_CONVERT)));
4048 ASSERT(INT_GET(agfp->agf_seqno, ARCH_CONVERT) == agno);
4049
4050 freeblks += INT_GET(agfp->agf_freeblks, ARCH_CONVERT) +
4051 INT_GET(agfp->agf_flcount, ARCH_CONVERT);
4052 xfs_buf_relse(agfbp);
4053
4054 agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4055 agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4056 XFS_FSS_TO_BB(mp, 1), 0);
4057 if (XFS_BUF_ISERROR(agibp)) {
4058 xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4059 mp, agibp, agidaddr);
4060 }
4061 agip = XFS_BUF_TO_AGI(agibp);
4062 ASSERT(XFS_AGI_MAGIC ==
4063 INT_GET(agip->agi_magicnum, ARCH_CONVERT));
4064 ASSERT(XFS_AGI_GOOD_VERSION(
4065 INT_GET(agip->agi_versionnum, ARCH_CONVERT)));
4066 ASSERT(INT_GET(agip->agi_seqno, ARCH_CONVERT) == agno);
4067
4068 itotal += INT_GET(agip->agi_count, ARCH_CONVERT);
4069 ifree += INT_GET(agip->agi_freecount, ARCH_CONVERT);
4070 xfs_buf_relse(agibp);
4071 }
4072
4073 sbbp = xfs_getsb(mp, 0);
4074#ifdef XFS_LOUD_RECOVERY
4075 sbp = &mp->m_sb;
4076 xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, XFS_SB_ALL_BITS);
4077 cmn_err(CE_NOTE,
4078 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
4079 sbp->sb_icount, itotal);
4080 cmn_err(CE_NOTE,
4081 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
4082 sbp->sb_ifree, ifree);
4083 cmn_err(CE_NOTE,
4084 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4085 sbp->sb_fdblocks, freeblks);
4086#if 0
4087 /*
4088 * This is turned off until I account for the allocation
4089 * btree blocks which live in free space.
4090 */
4091 ASSERT(sbp->sb_icount == itotal);
4092 ASSERT(sbp->sb_ifree == ifree);
4093 ASSERT(sbp->sb_fdblocks == freeblks);
4094#endif
4095#endif
4096 xfs_buf_relse(sbbp);
4097}
4098#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
new file mode 100644
index 000000000000..42158b442b55
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.h
@@ -0,0 +1,81 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LOG_RECOVER_H__
33#define __XFS_LOG_RECOVER_H__
34
35/*
36 * Macros, structures, prototypes for internal log manager use.
37 */
38
39#define XLOG_RHASH_BITS 4
40#define XLOG_RHASH_SIZE 16
41#define XLOG_RHASH_SHIFT 2
42#define XLOG_RHASH(tid) \
43 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
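/*
 * Editor's note (worked example): with XLOG_RHASH_SHIFT == 2 and
 * XLOG_RHASH_SIZE == 16, a tid of 0x1234 hashes to
 * (0x1234 >> 2) & 0xf == 0x48d & 0xf == 13; the low two bits are
 * discarded before masking down to one of the 16 buckets.
 */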
44
45#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
46
47
48/*
49 * item headers are in ri_buf[0]. Additional buffers follow.
50 */
51typedef struct xlog_recover_item {
52 struct xlog_recover_item *ri_next;
53 struct xlog_recover_item *ri_prev;
54 int ri_type;
55 int ri_cnt; /* count of regions found */
56 int ri_total; /* total regions */
57 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
58} xlog_recover_item_t;
59
60struct xlog_tid;
61typedef struct xlog_recover {
62 struct xlog_recover *r_next;
63 xlog_tid_t r_log_tid; /* log's transaction id */
64 xfs_trans_header_t r_theader; /* trans header for partial */
65 int r_state; /* not needed */
66 xfs_lsn_t r_lsn; /* xact lsn */
67 xlog_recover_item_t *r_itemq; /* q for items */
68} xlog_recover_t;
69
70#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
71
72/*
73 * This is the number of entries in the l_buf_cancel_table used during
74 * recovery.
75 */
76#define XLOG_BC_TABLE_SIZE 64
77
78#define XLOG_RECOVER_PASS1 1
79#define XLOG_RECOVER_PASS2 2
80
81#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_mac.h b/fs/xfs/xfs_mac.h
new file mode 100644
index 000000000000..8d59aaffeb8e
--- /dev/null
+++ b/fs/xfs/xfs_mac.h
@@ -0,0 +1,120 @@
1/*
2 * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_MAC_H__
33#define __XFS_MAC_H__
34
35/*
36 * Mandatory Access Control
37 *
38 * Layout of a composite MAC label:
39 * ml_list contains the list of categories (MSEN) followed by the list of
 40 * divisions (MINT). The struct below acts as a header; in use, ml_list
 41 * carries ml_catcount categories followed by ml_divcount divisions.
42 *
43 * -------------------------------
44 * | ml_msen_type | ml_mint_type |
45 * -------------------------------
46 * | ml_level | ml_grade |
47 * -------------------------------
48 * | ml_catcount |
49 * -------------------------------
50 * | ml_divcount |
51 * -------------------------------
52 * | category 1 |
53 * | . . . |
54 * | category N | (where N = ml_catcount)
55 * -------------------------------
56 * | division 1 |
57 * | . . . |
58 * | division M | (where M = ml_divcount)
59 * -------------------------------
60 */
61#define XFS_MAC_MAX_SETS 250
62typedef struct xfs_mac_label {
63 __uint8_t ml_msen_type; /* MSEN label type */
64 __uint8_t ml_mint_type; /* MINT label type */
65 __uint8_t ml_level; /* Hierarchical level */
66 __uint8_t ml_grade; /* Hierarchical grade */
67 __uint16_t ml_catcount; /* Category count */
68 __uint16_t ml_divcount; /* Division count */
69 /* Category set, then Division set */
70 __uint16_t ml_list[XFS_MAC_MAX_SETS];
71} xfs_mac_label_t;
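/*
 * Editor's sketch: categories and divisions share ml_list, categories
 * first, matching the layout diagram above.  A hedged illustration of
 * the indexing a consumer would use (hypothetical helpers, not part of
 * the MAC code):
 */
static __uint16_t
example_mac_category(xfs_mac_label_t *ml, int i)	/* i < ml_catcount */
{
	return ml->ml_list[i];
}

static __uint16_t
example_mac_division(xfs_mac_label_t *ml, int j)	/* j < ml_divcount */
{
	return ml->ml_list[ml->ml_catcount + j];
}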
72
73/* MSEN label type names. Choose an upper case ASCII character. */
74#define XFS_MSEN_ADMIN_LABEL 'A' /* Admin: low<admin != tcsec<high */
75#define XFS_MSEN_EQUAL_LABEL 'E' /* Wildcard - always equal */
76#define XFS_MSEN_HIGH_LABEL 'H' /* System High - always dominates */
77#define XFS_MSEN_MLD_HIGH_LABEL 'I' /* System High, multi-level dir */
78#define XFS_MSEN_LOW_LABEL 'L' /* System Low - always dominated */
79#define XFS_MSEN_MLD_LABEL 'M' /* TCSEC label on a multi-level dir */
80#define XFS_MSEN_MLD_LOW_LABEL 'N' /* System Low, multi-level dir */
81#define XFS_MSEN_TCSEC_LABEL 'T' /* TCSEC label */
82#define XFS_MSEN_UNKNOWN_LABEL 'U' /* unknown label */
83
84/* MINT label type names. Choose a lower case ASCII character. */
85#define XFS_MINT_BIBA_LABEL 'b' /* Dual of a TCSEC label */
86#define XFS_MINT_EQUAL_LABEL 'e' /* Wildcard - always equal */
87#define XFS_MINT_HIGH_LABEL 'h' /* High Grade - always dominates */
88#define XFS_MINT_LOW_LABEL 'l' /* Low Grade - always dominated */
89
90/* On-disk XFS extended attribute names */
91#define SGI_MAC_FILE "SGI_MAC_FILE"
92#define SGI_MAC_FILE_SIZE (sizeof(SGI_MAC_FILE)-1)
93
94
95#ifdef __KERNEL__
96
97#ifdef CONFIG_FS_POSIX_MAC
98
99/* NOT YET IMPLEMENTED */
100
101#define MACEXEC 00100
102#define MACWRITE 00200
103#define MACREAD 00400
104
105struct xfs_inode;
106extern int xfs_mac_iaccess(struct xfs_inode *, mode_t, cred_t *);
107
108#define _MAC_XFS_IACCESS(i,m,c) (xfs_mac_iaccess(i,m,c))
109#define _MAC_VACCESS(v,c,m) (xfs_mac_vaccess(v,c,m))
110#define _MAC_EXISTS xfs_mac_vhaslabel
111
112#else
113#define _MAC_XFS_IACCESS(i,m,c) (0)
114#define _MAC_VACCESS(v,c,m) (0)
115#define _MAC_EXISTS (NULL)
116#endif
117
118#endif /* __KERNEL__ */
119
120#endif /* __XFS_MAC_H__ */
diff --git a/fs/xfs/xfs_macros.c b/fs/xfs/xfs_macros.c
new file mode 100644
index 000000000000..ce4f46c6b3ab
--- /dev/null
+++ b/fs/xfs/xfs_macros.c
@@ -0,0 +1,2136 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#define XFS_MACRO_C
34
35#include "xfs.h"
36#include "xfs_macros.h"
37#include "xfs_types.h"
38#include "xfs_inum.h"
39#include "xfs_log.h"
40#include "xfs_trans.h"
41#include "xfs_sb.h"
42#include "xfs_ag.h"
43#include "xfs_dir.h"
44#include "xfs_dir2.h"
45#include "xfs_dmapi.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_bmap_btree.h"
50#include "xfs_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_ialloc.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_rw.h"
60#include "xfs_log_priv.h"
61#include "xfs_da_btree.h"
62#include "xfs_attr_leaf.h"
63#include "xfs_dir_leaf.h"
64#include "xfs_dir2_data.h"
65#include "xfs_dir2_leaf.h"
66#include "xfs_dir2_block.h"
67#include "xfs_dir2_node.h"
68#include "xfs_bit.h"
69
70#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLDSTARTBLOCK)
71int
72isnulldstartblock(xfs_dfsbno_t x)
73{
74 return ISNULLDSTARTBLOCK(x);
75}
76#endif
77
78#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLSTARTBLOCK)
79int
80isnullstartblock(xfs_fsblock_t x)
81{
82 return ISNULLSTARTBLOCK(x);
83}
84#endif
85
86#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_NULLSTARTBLOCK)
87xfs_fsblock_t
88nullstartblock(int k)
89{
90 return NULLSTARTBLOCK(k);
91}
92#endif
93
94#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_STARTBLOCKVAL)
95xfs_filblks_t
96startblockval(xfs_fsblock_t x)
97{
98 return STARTBLOCKVAL(x);
99}
100#endif
101
102#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_CHECK_DADDR)
103void
104xfs_ag_check_daddr(xfs_mount_t *mp, xfs_daddr_t d, xfs_extlen_t len)
105{
106 XFS_AG_CHECK_DADDR(mp, d, len);
107}
108#endif
109
110#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_DADDR)
111xfs_daddr_t
112xfs_ag_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_daddr_t d)
113{
114 return XFS_AG_DADDR(mp, agno, d);
115}
116#endif
117
118#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MAXLEVELS)
119int
120xfs_ag_maxlevels(xfs_mount_t *mp)
121{
122 return XFS_AG_MAXLEVELS(mp);
123}
124#endif
125
126#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_DADDR)
127xfs_daddr_t
128xfs_agb_to_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
129{
130 return XFS_AGB_TO_DADDR(mp, agno, agbno);
131}
132#endif
133
134#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_FSB)
135xfs_fsblock_t
136xfs_agb_to_fsb(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
137{
138 return XFS_AGB_TO_FSB(mp, agno, agbno);
139}
140#endif
141
142#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MAX)
143xfs_agblock_t
144xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b)
145{
146 return XFS_AGBLOCK_MAX(a, b);
147}
148#endif
149
150#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MIN)
151xfs_agblock_t
152xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b)
153{
154 return XFS_AGBLOCK_MIN(a, b);
155}
156#endif
157
158#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_BLOCK)
159xfs_agblock_t
160xfs_agf_block(xfs_mount_t *mp)
161{
162 return XFS_AGF_BLOCK(mp);
163}
164#endif
165
166#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_GOOD_VERSION)
167int
168xfs_agf_good_version(unsigned v)
169{
170 return XFS_AGF_GOOD_VERSION(v);
171}
172#endif
173
174#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGFL_BLOCK)
175xfs_agblock_t
176xfs_agfl_block(xfs_mount_t *mp)
177{
178 return XFS_AGFL_BLOCK(mp);
179}
180#endif
181
182#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_BLOCK)
183xfs_agblock_t
184xfs_agi_block(xfs_mount_t *mp)
185{
186 return XFS_AGI_BLOCK(mp);
187}
188#endif
189
190#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_GOOD_VERSION)
191int
192xfs_agi_good_version(unsigned v)
193{
194 return XFS_AGI_GOOD_VERSION(v);
195}
196#endif
197
198#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_AGBNO)
199xfs_agblock_t
200xfs_agino_to_agbno(xfs_mount_t *mp, xfs_agino_t i)
201{
202 return XFS_AGINO_TO_AGBNO(mp, i);
203}
204#endif
205
206#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_INO)
207xfs_ino_t
208xfs_agino_to_ino(xfs_mount_t *mp, xfs_agnumber_t a, xfs_agino_t i)
209{
210 return XFS_AGINO_TO_INO(mp, a, i);
211}
212#endif
213
214#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_OFFSET)
215int
216xfs_agino_to_offset(xfs_mount_t *mp, xfs_agino_t i)
217{
218 return XFS_AGINO_TO_OFFSET(mp, i);
219}
220#endif
221
222#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
223int
224xfs_alloc_block_maxrecs(int lev, xfs_btree_cur_t *cur)
225{
226 return XFS_ALLOC_BLOCK_MAXRECS(lev, cur);
227}
228#endif
229
230#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
231int
232xfs_alloc_block_minrecs(int lev, xfs_btree_cur_t *cur)
233{
234 return XFS_ALLOC_BLOCK_MINRECS(lev, cur);
235}
236#endif
237
238#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_SIZE)
239/*ARGSUSED1*/
240int
241xfs_alloc_block_size(int lev, xfs_btree_cur_t *cur)
242{
243 return XFS_ALLOC_BLOCK_SIZE(lev, cur);
244}
245#endif
246
247#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_KEY_ADDR)
248/*ARGSUSED3*/
249xfs_alloc_key_t *
250xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
251{
252 return XFS_ALLOC_KEY_ADDR(bb, i, cur);
253}
254#endif
255
256#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_PTR_ADDR)
257xfs_alloc_ptr_t *
258xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
259{
260 return XFS_ALLOC_PTR_ADDR(bb, i, cur);
261}
262#endif
263
264#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_REC_ADDR)
265/*ARGSUSED3*/
266xfs_alloc_rec_t *
267xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
268{
269 return XFS_ALLOC_REC_ADDR(bb, i, cur);
270}
271#endif
272
273#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
274int
275xfs_attr_leaf_entsize_local(int nlen, int vlen)
276{
277 return XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen, vlen);
278}
279#endif
280
281#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
282int
283xfs_attr_leaf_entsize_local_max(int bsize)
284{
285 return XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize);
286}
287#endif
288
289#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
290int
291xfs_attr_leaf_entsize_remote(int nlen)
292{
293 return XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen);
294}
295#endif
296
297#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME)
298char *
299xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
300{
301 return XFS_ATTR_LEAF_NAME(leafp, idx);
302}
303#endif
304
305#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
306xfs_attr_leaf_name_local_t *
307xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
308{
309 return XFS_ATTR_LEAF_NAME_LOCAL(leafp, idx);
310}
311#endif
312
313#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
314xfs_attr_leaf_name_remote_t *
315xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
316{
317 return XFS_ATTR_LEAF_NAME_REMOTE(leafp, idx);
318}
319#endif
320
321#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE)
322int
323xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep)
324{
325 return XFS_ATTR_SF_ENTSIZE(sfep);
326}
327#endif
328
329#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
330int
331xfs_attr_sf_entsize_byname(int nlen, int vlen)
332{
333 return XFS_ATTR_SF_ENTSIZE_BYNAME(nlen, vlen);
334}
335#endif
336
337#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_NEXTENTRY)
338xfs_attr_sf_entry_t *
339xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep)
340{
341 return XFS_ATTR_SF_NEXTENTRY(sfep);
342}
343#endif
344
345#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_TOTSIZE)
346int
347xfs_attr_sf_totsize(xfs_inode_t *dp)
348{
349 return XFS_ATTR_SF_TOTSIZE(dp);
350}
351#endif
352
353#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOI)
354xfs_inode_t *
355xfs_bhvtoi(bhv_desc_t *bhvp)
356{
357 return XFS_BHVTOI(bhvp);
358}
359#endif
360
361#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOM)
362xfs_mount_t *
363xfs_bhvtom(bhv_desc_t *bdp)
364{
365 return XFS_BHVTOM(bdp);
366}
367#endif
368
369#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_VFSTOM)
370xfs_mount_t *
371xfs_vfstom(vfs_t *vfs)
372{
373 return XFS_VFSTOM(vfs);
374}
375#endif
376
377#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BM_MAXLEVELS)
378int
379xfs_bm_maxlevels(xfs_mount_t *mp, int w)
380{
381 return XFS_BM_MAXLEVELS(mp, w);
382}
383#endif
384
385#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
386int
387xfs_bmap_block_dmaxrecs(int lev, xfs_btree_cur_t *cur)
388{
389 return XFS_BMAP_BLOCK_DMAXRECS(lev, cur);
390}
391#endif
392
393#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
394int
395xfs_bmap_block_dminrecs(int lev, xfs_btree_cur_t *cur)
396{
397 return XFS_BMAP_BLOCK_DMINRECS(lev, cur);
398}
399#endif
400
401#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DSIZE)
402int
403xfs_bmap_block_dsize(int lev, xfs_btree_cur_t *cur)
404{
405 return XFS_BMAP_BLOCK_DSIZE(lev, cur);
406}
407#endif
408
409#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
410int
411xfs_bmap_block_imaxrecs(int lev, xfs_btree_cur_t *cur)
412{
413 return XFS_BMAP_BLOCK_IMAXRECS(lev, cur);
414}
415#endif
416
417#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
418int
419xfs_bmap_block_iminrecs(int lev, xfs_btree_cur_t *cur)
420{
421 return XFS_BMAP_BLOCK_IMINRECS(lev, cur);
422}
423#endif
424
425#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_ISIZE)
426int
427xfs_bmap_block_isize(int lev, xfs_btree_cur_t *cur)
428{
429 return XFS_BMAP_BLOCK_ISIZE(lev, cur);
430}
431#endif
432
433#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
434/*ARGSUSED3*/
435xfs_bmbt_key_t *
436xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz)
437{
438 return XFS_BMAP_BROOT_KEY_ADDR(bb, i, sz);
439}
440#endif
441
442#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_MAXRECS)
443int
444xfs_bmap_broot_maxrecs(int sz)
445{
446 return XFS_BMAP_BROOT_MAXRECS(sz);
447}
448#endif
449
450#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_NUMRECS)
451int
452xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb)
453{
454 return XFS_BMAP_BROOT_NUMRECS(bb);
455}
456#endif
457
458#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
459xfs_bmbt_ptr_t *
460xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz)
461{
462 return XFS_BMAP_BROOT_PTR_ADDR(bb, i, sz);
463}
464#endif
465
466#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
467/*ARGSUSED3*/
468xfs_bmbt_rec_t *
469xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz)
470{
471 return XFS_BMAP_BROOT_REC_ADDR(bb, i, sz);
472}
473#endif
474
475#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE)
476int
477xfs_bmap_broot_space(xfs_bmdr_block_t *bb)
478{
479 return XFS_BMAP_BROOT_SPACE(bb);
480}
481#endif
482
483#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
484int
485xfs_bmap_broot_space_calc(int nrecs)
486{
487 return XFS_BMAP_BROOT_SPACE_CALC(nrecs);
488}
489#endif
490
491#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_IBLOCK_SIZE)
492/*ARGSUSED1*/
493int
494xfs_bmap_iblock_size(int lev, xfs_btree_cur_t *cur)
495{
496 return XFS_BMAP_IBLOCK_SIZE(lev, cur);
497}
498#endif
499
500#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_INIT)
501void
502xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
503{
504 XFS_BMAP_INIT(flp, fbp);
505}
506#endif
507
508#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_DADDR)
509/*ARGSUSED3*/
510xfs_bmbt_key_t *
511xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
512{
513 return XFS_BMAP_KEY_DADDR(bb, i, cur);
514}
515#endif
516
517#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_IADDR)
518/*ARGSUSED3*/
519xfs_bmbt_key_t *
520xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
521{
522 return XFS_BMAP_KEY_IADDR(bb, i, cur);
523}
524#endif
525
526#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_DADDR)
527xfs_bmbt_ptr_t *
528xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
529{
530 return XFS_BMAP_PTR_DADDR(bb, i, cur);
531}
532#endif
533
534#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_IADDR)
535xfs_bmbt_ptr_t *
536xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
537{
538 return XFS_BMAP_PTR_IADDR(bb, i, cur);
539}
540#endif
541
542#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
543/*ARGSUSED1*/
544int
545xfs_bmap_rblock_dsize(int lev, xfs_btree_cur_t *cur)
546{
547 return XFS_BMAP_RBLOCK_DSIZE(lev, cur);
548}
549#endif
550
551#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
552/*ARGSUSED1*/
553int
554xfs_bmap_rblock_isize(int lev, xfs_btree_cur_t *cur)
555{
556 return XFS_BMAP_RBLOCK_ISIZE(lev, cur);
557}
558#endif
559
560#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_DADDR)
561/*ARGSUSED3*/
562xfs_bmbt_rec_t *
563xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
564{
565 return XFS_BMAP_REC_DADDR(bb, i, cur);
566}
567#endif
568
569#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_IADDR)
570/*ARGSUSED3*/
571xfs_bmbt_rec_t *
572xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
573{
574 return XFS_BMAP_REC_IADDR(bb, i, cur);
575}
576#endif
577
578#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_SANITY_CHECK)
579int
580xfs_bmap_sanity_check(xfs_mount_t *mp, xfs_bmbt_block_t *bb, int level)
581{
582 return XFS_BMAP_SANITY_CHECK(mp, bb, level);
583}
584#endif
585
586#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAPI_AFLAG)
587int
588xfs_bmapi_aflag(int w)
589{
590 return XFS_BMAPI_AFLAG(w);
591}
592#endif
593
594#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMDR_SPACE_CALC)
595int
596xfs_bmdr_space_calc(int nrecs)
597{
598 return XFS_BMDR_SPACE_CALC(nrecs);
599}
600#endif
601
602#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BNO_BLOCK)
603xfs_agblock_t
604xfs_bno_block(xfs_mount_t *mp)
605{
606 return XFS_BNO_BLOCK(mp);
607}
608#endif
609
610#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BTREE_LONG_PTRS)
611int
612xfs_btree_long_ptrs(xfs_btnum_t btnum)
613{
614 return XFS_BTREE_LONG_PTRS(btnum);
615}
616#endif
617
618#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGF)
619xfs_agf_t *
620xfs_buf_to_agf(xfs_buf_t *bp)
621{
622 return XFS_BUF_TO_AGF(bp);
623}
624#endif
625
626#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGFL)
627xfs_agfl_t *
628xfs_buf_to_agfl(xfs_buf_t *bp)
629{
630 return XFS_BUF_TO_AGFL(bp);
631}
632#endif
633
634#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGI)
635xfs_agi_t *
636xfs_buf_to_agi(xfs_buf_t *bp)
637{
638 return XFS_BUF_TO_AGI(bp);
639}
640#endif
641
642#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
643xfs_alloc_block_t *
644xfs_buf_to_alloc_block(xfs_buf_t *bp)
645{
646 return XFS_BUF_TO_ALLOC_BLOCK(bp);
647}
648#endif
649
650#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BLOCK)
651xfs_btree_block_t *
652xfs_buf_to_block(xfs_buf_t *bp)
653{
654 return XFS_BUF_TO_BLOCK(bp);
655}
656#endif
657
658#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
659xfs_bmbt_block_t *
660xfs_buf_to_bmbt_block(xfs_buf_t *bp)
661{
662 return XFS_BUF_TO_BMBT_BLOCK(bp);
663}
664#endif
665
666#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_DINODE)
667xfs_dinode_t *
668xfs_buf_to_dinode(xfs_buf_t *bp)
669{
670 return XFS_BUF_TO_DINODE(bp);
671}
672#endif
673
674#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
675xfs_inobt_block_t *
676xfs_buf_to_inobt_block(xfs_buf_t *bp)
677{
678 return XFS_BUF_TO_INOBT_BLOCK(bp);
679}
680#endif
681
682#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_LBLOCK)
683xfs_btree_lblock_t *
684xfs_buf_to_lblock(xfs_buf_t *bp)
685{
686 return XFS_BUF_TO_LBLOCK(bp);
687}
688#endif
689
690#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBLOCK)
691xfs_btree_sblock_t *
692xfs_buf_to_sblock(xfs_buf_t *bp)
693{
694 return XFS_BUF_TO_SBLOCK(bp);
695}
696#endif
697
698#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBP)
699xfs_sb_t *
700xfs_buf_to_sbp(xfs_buf_t *bp)
701{
702 return XFS_BUF_TO_SBP(bp);
703}
704#endif
705
706#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_ASIZE)
707int
708xfs_cfork_asize_disk(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
709{
710 return XFS_CFORK_ASIZE_DISK(dcp, mp);
711}
712int
713xfs_cfork_asize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
714{
715 return XFS_CFORK_ASIZE(dcp, mp);
716}
717#endif
718
719#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_BOFF)
720int
721xfs_cfork_boff_disk(xfs_dinode_core_t *dcp)
722{
723 return XFS_CFORK_BOFF_DISK(dcp);
724}
725int
726xfs_cfork_boff(xfs_dinode_core_t *dcp)
727{
728 return XFS_CFORK_BOFF(dcp);
729}
730#endif
731
732#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_DSIZE)
733int
734xfs_cfork_dsize_disk(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
735{
736 return XFS_CFORK_DSIZE_DISK(dcp, mp);
737}
738int
739xfs_cfork_dsize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
740{
741 return XFS_CFORK_DSIZE(dcp, mp);
742}
743#endif
744
745#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FMT_SET)
746void
747xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n)
748{
749 XFS_CFORK_FMT_SET(dcp, w, n);
750}
751#endif
752
753#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FORMAT)
754int
755xfs_cfork_format(xfs_dinode_core_t *dcp, int w)
756{
757 return XFS_CFORK_FORMAT(dcp, w);
758}
759#endif
760
761#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXT_SET)
762void
763xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n)
764{
765 XFS_CFORK_NEXT_SET(dcp, w, n);
766}
767#endif
768
769#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXTENTS)
770int
771xfs_cfork_nextents_disk(xfs_dinode_core_t *dcp, int w)
772{
773 return XFS_CFORK_NEXTENTS_DISK(dcp, w);
774}
775int
776xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w)
777{
778 return XFS_CFORK_NEXTENTS(dcp, w);
779}
780#endif
781
782#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_Q)
783int
784xfs_cfork_q_disk(xfs_dinode_core_t *dcp)
785{
786 return XFS_CFORK_Q_DISK(dcp);
787}
788int
789xfs_cfork_q(xfs_dinode_core_t *dcp)
790{
791 return XFS_CFORK_Q(dcp);
792}
793#endif
794
795#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_SIZE)
796int
797xfs_cfork_size_disk(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w)
798{
799 return XFS_CFORK_SIZE_DISK(dcp, mp, w);
800}
801int
802xfs_cfork_size(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w)
803{
804 return XFS_CFORK_SIZE(dcp, mp, w);
805}
806#endif
807
808#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CNT_BLOCK)
809xfs_agblock_t
810xfs_cnt_block(xfs_mount_t *mp)
811{
812 return XFS_CNT_BLOCK(mp);
813}
814#endif
815
816#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_BNO)
817xfs_dablk_t
818xfs_da_cookie_bno(xfs_mount_t *mp, xfs_off_t cookie)
819{
820 return XFS_DA_COOKIE_BNO(mp, cookie);
821}
822#endif
823
824#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_ENTRY)
825int
826xfs_da_cookie_entry(xfs_mount_t *mp, xfs_off_t cookie)
827{
828 return XFS_DA_COOKIE_ENTRY(mp, cookie);
829}
830#endif
831
832#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_HASH)
833/*ARGSUSED1*/
834xfs_dahash_t
835xfs_da_cookie_hash(xfs_mount_t *mp, xfs_off_t cookie)
836{
837 return XFS_DA_COOKIE_HASH(mp, cookie);
838}
839#endif
840
841#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_BNOENTRY)
842__uint32_t
843xfs_da_make_bnoentry(xfs_mount_t *mp, xfs_dablk_t bno, int entry)
844{
845 return XFS_DA_MAKE_BNOENTRY(mp, bno, entry);
846}
847#endif
848
849#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_COOKIE)
850xfs_off_t
851xfs_da_make_cookie(xfs_mount_t *mp, xfs_dablk_t bno, int entry,
852 xfs_dahash_t hash)
853{
854 return XFS_DA_MAKE_COOKIE(mp, bno, entry, hash);
855}
856#endif
857
858#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGBNO)
859xfs_agblock_t
860xfs_daddr_to_agbno(xfs_mount_t *mp, xfs_daddr_t d)
861{
862 return XFS_DADDR_TO_AGBNO(mp, d);
863}
864#endif
865
866#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGNO)
867xfs_agnumber_t
868xfs_daddr_to_agno(xfs_mount_t *mp, xfs_daddr_t d)
869{
870 return XFS_DADDR_TO_AGNO(mp, d);
871}
872#endif
873
874#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_FSB)
875xfs_fsblock_t
876xfs_daddr_to_fsb(xfs_mount_t *mp, xfs_daddr_t d)
877{
878 return XFS_DADDR_TO_FSB(mp, d);
879}
880#endif
881
882#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_APTR)
883char *
884xfs_dfork_aptr(xfs_dinode_t *dip)
885{
886 return XFS_DFORK_APTR(dip);
887}
888#endif
889
890#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_ASIZE)
891int
892xfs_dfork_asize(xfs_dinode_t *dip, xfs_mount_t *mp)
893{
894 return XFS_DFORK_ASIZE(dip, mp);
895}
896#endif
897
898#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_BOFF)
899int
900xfs_dfork_boff(xfs_dinode_t *dip)
901{
902 return XFS_DFORK_BOFF(dip);
903}
904#endif
905
906#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DPTR)
907char *
908xfs_dfork_dptr(xfs_dinode_t *dip)
909{
910 return XFS_DFORK_DPTR(dip);
911}
912#endif
913
914#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DSIZE)
915int
916xfs_dfork_dsize(xfs_dinode_t *dip, xfs_mount_t *mp)
917{
918 return XFS_DFORK_DSIZE(dip, mp);
919}
920#endif
921
922#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_NEXTENTS)
923int
924xfs_dfork_nextents(xfs_dinode_t *dip, int w)
925{
926 return XFS_DFORK_NEXTENTS(dip, w);
927}
928#endif
929
930#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_PTR)
931char *
932xfs_dfork_ptr(xfs_dinode_t *dip, int w)
933{
934 return XFS_DFORK_PTR(dip, w);
935}
936#endif
937
938#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_Q)
939int
940xfs_dfork_q(xfs_dinode_t *dip)
941{
942 return XFS_DFORK_Q(dip);
943}
944#endif
945
946#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_SIZE)
947int
948xfs_dfork_size(xfs_dinode_t *dip, xfs_mount_t *mp, int w)
949{
950 return XFS_DFORK_SIZE(dip, mp, w);
951}
952#endif
953
954#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DINODE_GOOD_VERSION)
955int
956xfs_dinode_good_version(int v)
957{
958 return XFS_DINODE_GOOD_VERSION(v);
959}
960#endif
961
962#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
963int
964xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
965{
966 return XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
967}
968#endif
969
970#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
971int
972xfs_dir_leaf_entsize_byname(int len)
973{
974 return XFS_DIR_LEAF_ENTSIZE_BYNAME(len);
975}
976#endif
977
978#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
979xfs_dir_leaf_name_t *
980xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
981{
982 return XFS_DIR_LEAF_NAMESTRUCT(leafp, offset);
983}
984#endif
985
986#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ALLFIT)
987int
988xfs_dir_sf_allfit(int count, int totallen)
989{
990 return XFS_DIR_SF_ALLFIT(count, totallen);
991}
992#endif
993
994#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
995int
996xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
997{
998 return XFS_DIR_SF_ENTSIZE_BYENTRY(sfep);
999}
1000#endif
1001
1002#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
1003int
1004xfs_dir_sf_entsize_byname(int len)
1005{
1006 return XFS_DIR_SF_ENTSIZE_BYNAME(len);
1007}
1008#endif
1009
1010#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_GET_DIRINO)
1011void
1012xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
1013{
1014 XFS_DIR_SF_GET_DIRINO(from, to);
1015}
1016#endif
1017
1018#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_NEXTENTRY)
1019xfs_dir_sf_entry_t *
1020xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
1021{
1022 return XFS_DIR_SF_NEXTENTRY(sfep);
1023}
1024#endif
1025
1026#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_PUT_DIRINO)
1027void
1028xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
1029{
1030 XFS_DIR_SF_PUT_DIRINO(from, to);
1031}
1032#endif
1033
1034#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
1035xfs_dir2_leaf_entry_t *
1036xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp)
1037{
1038 return XFS_DIR2_BLOCK_LEAF_P(btp);
1039}
1040#endif
1041
1042#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
1043xfs_dir2_block_tail_t *
1044xfs_dir2_block_tail_p(xfs_mount_t *mp, xfs_dir2_block_t *block)
1045{
1046 return XFS_DIR2_BLOCK_TAIL_P(mp, block);
1047}
1048#endif
1049
1050#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DA)
1051xfs_dablk_t
1052xfs_dir2_byte_to_da(xfs_mount_t *mp, xfs_dir2_off_t by)
1053{
1054 return XFS_DIR2_BYTE_TO_DA(mp, by);
1055}
1056#endif
1057
1058#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
1059/* ARGSUSED */
1060xfs_dir2_dataptr_t
1061xfs_dir2_byte_to_dataptr(xfs_mount_t *mp, xfs_dir2_off_t by)
1062{
1063 return XFS_DIR2_BYTE_TO_DATAPTR(mp, by);
1064}
1065#endif
1066
1067#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DB)
1068xfs_dir2_db_t
1069xfs_dir2_byte_to_db(xfs_mount_t *mp, xfs_dir2_off_t by)
1070{
1071 return XFS_DIR2_BYTE_TO_DB(mp, by);
1072}
1073#endif
1074
1075#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_OFF)
1076xfs_dir2_data_aoff_t
1077xfs_dir2_byte_to_off(xfs_mount_t *mp, xfs_dir2_off_t by)
1078{
1079 return XFS_DIR2_BYTE_TO_OFF(mp, by);
1080}
1081#endif
1082
1083#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_BYTE)
1084xfs_dir2_off_t
1085xfs_dir2_da_to_byte(xfs_mount_t *mp, xfs_dablk_t da)
1086{
1087 return XFS_DIR2_DA_TO_BYTE(mp, da);
1088}
1089#endif
1090
1091#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_DB)
1092xfs_dir2_db_t
1093xfs_dir2_da_to_db(xfs_mount_t *mp, xfs_dablk_t da)
1094{
1095 return XFS_DIR2_DA_TO_DB(mp, da);
1096}
1097#endif
1098
1099#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
1100xfs_dir2_data_off_t *
1101xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep)
1102{
1103 return XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1104}
1105#endif
1106
1107#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTSIZE)
1108int
1109xfs_dir2_data_entsize(int n)
1110{
1111 return XFS_DIR2_DATA_ENTSIZE(n);
1112}
1113#endif
1114
1115#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
1116xfs_dir2_data_off_t *
1117xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup)
1118{
1119 return XFS_DIR2_DATA_UNUSED_TAG_P(dup);
1120}
1121#endif
1122
1123#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
1124/* ARGSUSED */
1125xfs_dir2_off_t
1126xfs_dir2_dataptr_to_byte(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
1127{
1128 return XFS_DIR2_DATAPTR_TO_BYTE(mp, dp);
1129}
1130#endif
1131
1132#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
1133xfs_dir2_db_t
1134xfs_dir2_dataptr_to_db(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
1135{
1136 return XFS_DIR2_DATAPTR_TO_DB(mp, dp);
1137}
1138#endif
1139
1140#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
1141xfs_dir2_data_aoff_t
1142xfs_dir2_dataptr_to_off(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
1143{
1144 return XFS_DIR2_DATAPTR_TO_OFF(mp, dp);
1145}
1146#endif
1147
1148#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
1149xfs_dir2_off_t
1150xfs_dir2_db_off_to_byte(xfs_mount_t *mp, xfs_dir2_db_t db,
1151 xfs_dir2_data_aoff_t o)
1152{
1153 return XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o);
1154}
1155#endif
1156
1157#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
1158xfs_dir2_dataptr_t
1159xfs_dir2_db_off_to_dataptr(xfs_mount_t *mp, xfs_dir2_db_t db,
1160 xfs_dir2_data_aoff_t o)
1161{
1162 return XFS_DIR2_DB_OFF_TO_DATAPTR(mp, db, o);
1163}
1164#endif
1165
1166#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_DA)
1167xfs_dablk_t
1168xfs_dir2_db_to_da(xfs_mount_t *mp, xfs_dir2_db_t db)
1169{
1170 return XFS_DIR2_DB_TO_DA(mp, db);
1171}
1172#endif
1173
1174#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDB)
1175xfs_dir2_db_t
1176xfs_dir2_db_to_fdb(xfs_mount_t *mp, xfs_dir2_db_t db)
1177{
1178 return XFS_DIR2_DB_TO_FDB(mp, db);
1179}
1180#endif
1181
1182#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
1183int
1184xfs_dir2_db_to_fdindex(xfs_mount_t *mp, xfs_dir2_db_t db)
1185{
1186 return XFS_DIR2_DB_TO_FDINDEX(mp, db);
1187}
1188#endif
1189
1190#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_BESTS_P)
1191xfs_dir2_data_off_t *
1192xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp)
1193{
1194 return XFS_DIR2_LEAF_BESTS_P(ltp);
1195}
1196#endif
1197
1198#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_TAIL_P)
1199xfs_dir2_leaf_tail_t *
1200xfs_dir2_leaf_tail_p(xfs_mount_t *mp, xfs_dir2_leaf_t *lp)
1201{
1202 return XFS_DIR2_LEAF_TAIL_P(mp, lp);
1203}
1204#endif
1205
1206#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
1207int
1208xfs_dir2_max_leaf_ents(xfs_mount_t *mp)
1209{
1210 return XFS_DIR2_MAX_LEAF_ENTS(mp);
1211}
1212#endif
1213
1214#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
1215int
1216xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
1217{
1218 return XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp, sfep);
1219}
1220#endif
1221
1222#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
1223xfs_dir2_sf_entry_t *
1224xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp)
1225{
1226 return XFS_DIR2_SF_FIRSTENTRY(sfp);
1227}
1228#endif
1229
1230#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
1231int
1232xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
1233{
1234 return XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, len);
1235}
1236#endif
1237
1238#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_INUMBER)
1239xfs_intino_t
1240xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from)
1241{
1242 return XFS_DIR2_SF_GET_INUMBER(sfp, from);
1243}
1244#endif
1245
1246#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_OFFSET)
1247xfs_dir2_data_aoff_t
1248xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
1249{
1250 return XFS_DIR2_SF_GET_OFFSET(sfep);
1251}
1252#endif
1253
1254#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_HDR_SIZE)
1255int
1256xfs_dir2_sf_hdr_size(int i8count)
1257{
1258 return XFS_DIR2_SF_HDR_SIZE(i8count);
1259}
1260#endif
1261
1262#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_INUMBERP)
1263xfs_dir2_inou_t *
1264xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep)
1265{
1266 return XFS_DIR2_SF_INUMBERP(sfep);
1267}
1268#endif
1269
1270#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_NEXTENTRY)
1271xfs_dir2_sf_entry_t *
1272xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
1273{
1274 return XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
1275}
1276#endif
1277
1278#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
1279void
1280xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, xfs_dir2_inou_t *to)
1281{
1282 XFS_DIR2_SF_PUT_INUMBER(sfp, from, to);
1283}
1284#endif
1285
1286#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
1287void
1288xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
1289{
1290 XFS_DIR2_SF_PUT_OFFSET(sfep, off);
1291}
1292#endif
1293
1294#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTFMT_INODE)
1295xfs_exntfmt_t
1296xfs_extfmt_inode(struct xfs_inode *ip)
1297{
1298 return XFS_EXTFMT_INODE(ip);
1299}
1300#endif
1301
1302#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MAX)
1303xfs_extlen_t
1304xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b)
1305{
1306 return XFS_EXTLEN_MAX(a, b);
1307}
1308#endif
1309
1310#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MIN)
1311xfs_extlen_t
1312xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b)
1313{
1314 return XFS_EXTLEN_MIN(a, b);
1315}
1316#endif
1317
1318#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MAX)
1319xfs_filblks_t
1320xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b)
1321{
1322 return XFS_FILBLKS_MAX(a, b);
1323}
1324#endif
1325
1326#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MIN)
1327xfs_filblks_t
1328xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b)
1329{
1330 return XFS_FILBLKS_MIN(a, b);
1331}
1332#endif
1333
1334#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MAX)
1335xfs_fileoff_t
1336xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b)
1337{
1338 return XFS_FILEOFF_MAX(a, b);
1339}
1340#endif
1341
1342#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MIN)
1343xfs_fileoff_t
1344xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b)
1345{
1346 return XFS_FILEOFF_MIN(a, b);
1347}
1348#endif
1349
1350#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_SANITY_CHECK)
1351int
1352xfs_fsb_sanity_check(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1353{
1354 return XFS_FSB_SANITY_CHECK(mp, fsbno);
1355}
1356#endif
1357
1358#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGBNO)
1359xfs_agblock_t
1360xfs_fsb_to_agbno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1361{
1362 return XFS_FSB_TO_AGBNO(mp, fsbno);
1363}
1364#endif
1365
1366#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGNO)
1367xfs_agnumber_t
1368xfs_fsb_to_agno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1369{
1370 return XFS_FSB_TO_AGNO(mp, fsbno);
1371}
1372#endif
1373
1374#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DADDR)
1375xfs_daddr_t
1376xfs_fsb_to_daddr(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1377{
1378 return XFS_FSB_TO_DADDR(mp, fsbno);
1379}
1380#endif
1381
1382#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DB)
1383xfs_daddr_t
1384xfs_fsb_to_db(xfs_inode_t *ip, xfs_fsblock_t fsb)
1385{
1386 return XFS_FSB_TO_DB(ip, fsb);
1387}
1388#endif
1389
1390#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_HDR_BLOCK)
1391xfs_agblock_t
1392xfs_hdr_block(xfs_mount_t *mp, xfs_daddr_t d)
1393{
1394 return XFS_HDR_BLOCK(mp, d);
1395}
1396#endif
1397
1398#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_BLOCKS)
1399xfs_extlen_t
1400xfs_ialloc_blocks(xfs_mount_t *mp)
1401{
1402 return XFS_IALLOC_BLOCKS(mp);
1403}
1404#endif
1405
1406#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_FIND_FREE)
1407int
1408xfs_ialloc_find_free(xfs_inofree_t *fp)
1409{
1410 return XFS_IALLOC_FIND_FREE(fp);
1411}
1412#endif
1413
1414#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_INODES)
1415int
1416xfs_ialloc_inodes(xfs_mount_t *mp)
1417{
1418 return XFS_IALLOC_INODES(mp);
1419}
1420#endif
1421
1422#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IBT_BLOCK)
1423xfs_agblock_t
1424xfs_ibt_block(xfs_mount_t *mp)
1425{
1426 return XFS_IBT_BLOCK(mp);
1427}
1428#endif
1429
1430#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_ASIZE)
1431int
1432xfs_ifork_asize(xfs_inode_t *ip)
1433{
1434 return XFS_IFORK_ASIZE(ip);
1435}
1436#endif
1437
1438#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_DSIZE)
1439int
1440xfs_ifork_dsize(xfs_inode_t *ip)
1441{
1442 return XFS_IFORK_DSIZE(ip);
1443}
1444#endif
1445
1446#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FMT_SET)
1447void
1448xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n)
1449{
1450 XFS_IFORK_FMT_SET(ip, w, n);
1451}
1452#endif
1453
1454#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FORMAT)
1455int
1456xfs_ifork_format(xfs_inode_t *ip, int w)
1457{
1458 return XFS_IFORK_FORMAT(ip, w);
1459}
1460#endif
1461
1462#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXT_SET)
1463void
1464xfs_ifork_next_set(xfs_inode_t *ip, int w, int n)
1465{
1466 XFS_IFORK_NEXT_SET(ip, w, n);
1467}
1468#endif
1469
1470#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXTENTS)
1471int
1472xfs_ifork_nextents(xfs_inode_t *ip, int w)
1473{
1474 return XFS_IFORK_NEXTENTS(ip, w);
1475}
1476#endif
1477
1478#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_PTR)
1479xfs_ifork_t *
1480xfs_ifork_ptr(xfs_inode_t *ip, int w)
1481{
1482 return XFS_IFORK_PTR(ip, w);
1483}
1484#endif
1485
1486#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_Q)
1487int
1488xfs_ifork_q(xfs_inode_t *ip)
1489{
1490 return XFS_IFORK_Q(ip);
1491}
1492#endif
1493
1494#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_SIZE)
1495int
1496xfs_ifork_size(xfs_inode_t *ip, int w)
1497{
1498 return XFS_IFORK_SIZE(ip, w);
1499}
1500#endif
1501
1502#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FBROOT)
1503int
1504xfs_ilog_fbroot(int w)
1505{
1506 return XFS_ILOG_FBROOT(w);
1507}
1508#endif
1509
1510#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FDATA)
1511int
1512xfs_ilog_fdata(int w)
1513{
1514 return XFS_ILOG_FDATA(w);
1515}
1516#endif
1517
1518#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FEXT)
1519int
1520xfs_ilog_fext(int w)
1521{
1522 return XFS_ILOG_FEXT(w);
1523}
1524#endif
1525
1526#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IN_MAXLEVELS)
1527int
1528xfs_in_maxlevels(xfs_mount_t *mp)
1529{
1530 return XFS_IN_MAXLEVELS(mp);
1531}
1532#endif
1533
1534#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGBNO_BITS)
1535int
1536xfs_ino_agbno_bits(xfs_mount_t *mp)
1537{
1538 return XFS_INO_AGBNO_BITS(mp);
1539}
1540#endif
1541
1542#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGINO_BITS)
1543int
1544xfs_ino_agino_bits(xfs_mount_t *mp)
1545{
1546 return XFS_INO_AGINO_BITS(mp);
1547}
1548#endif
1549
1550#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGNO_BITS)
1551int
1552xfs_ino_agno_bits(xfs_mount_t *mp)
1553{
1554 return XFS_INO_AGNO_BITS(mp);
1555}
1556#endif
1557
1558#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_BITS)
1559int
1560xfs_ino_bits(xfs_mount_t *mp)
1561{
1562 return XFS_INO_BITS(mp);
1563}
1564#endif
1565
1566#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_MASK)
1567__uint32_t
1568xfs_ino_mask(int k)
1569{
1570 return XFS_INO_MASK(k);
1571}
1572#endif
1573
1574#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_OFFSET_BITS)
1575int
1576xfs_ino_offset_bits(xfs_mount_t *mp)
1577{
1578 return XFS_INO_OFFSET_BITS(mp);
1579}
1580#endif
1581
1582#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGBNO)
1583xfs_agblock_t
1584xfs_ino_to_agbno(xfs_mount_t *mp, xfs_ino_t i)
1585{
1586 return XFS_INO_TO_AGBNO(mp, i);
1587}
1588#endif
1589
1590#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGINO)
1591xfs_agino_t
1592xfs_ino_to_agino(xfs_mount_t *mp, xfs_ino_t i)
1593{
1594 return XFS_INO_TO_AGINO(mp, i);
1595}
1596#endif
1597
1598#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGNO)
1599xfs_agnumber_t
1600xfs_ino_to_agno(xfs_mount_t *mp, xfs_ino_t i)
1601{
1602 return XFS_INO_TO_AGNO(mp, i);
1603}
1604#endif
1605
1606#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_FSB)
1607xfs_fsblock_t
1608xfs_ino_to_fsb(xfs_mount_t *mp, xfs_ino_t i)
1609{
1610 return XFS_INO_TO_FSB(mp, i);
1611}
1612#endif
1613
1614#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_OFFSET)
1615int
1616xfs_ino_to_offset(xfs_mount_t *mp, xfs_ino_t i)
1617{
1618 return XFS_INO_TO_OFFSET(mp, i);
1619}
1620#endif
1621
1622#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
1623int
1624xfs_inobt_block_maxrecs(int lev, xfs_btree_cur_t *cur)
1625{
1626 return XFS_INOBT_BLOCK_MAXRECS(lev, cur);
1627}
1628#endif
1629
1630#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MINRECS)
1631int
1632xfs_inobt_block_minrecs(int lev, xfs_btree_cur_t *cur)
1633{
1634 return XFS_INOBT_BLOCK_MINRECS(lev, cur);
1635}
1636#endif
1637
1638#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_SIZE)
1639/*ARGSUSED1*/
1640int
1641xfs_inobt_block_size(int lev, xfs_btree_cur_t *cur)
1642{
1643 return XFS_INOBT_BLOCK_SIZE(lev, cur);
1644}
1645#endif
1646
1647#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_CLR_FREE)
1648void
1649xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i)
1650{
1651 XFS_INOBT_CLR_FREE(rp, i);
1652}
1653#endif
1654
1655#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_FREE)
1656int
1657xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i)
1658{
1659 return XFS_INOBT_IS_FREE(rp, i);
1660}
1661#endif
1662
1663#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_LAST_REC)
1664int
1665xfs_inobt_is_last_rec(xfs_btree_cur_t *cur)
1666{
1667 return XFS_INOBT_IS_LAST_REC(cur);
1668}
1669#endif
1670
1671#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_KEY_ADDR)
1672/*ARGSUSED3*/
1673xfs_inobt_key_t *
1674xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
1675{
1676 return XFS_INOBT_KEY_ADDR(bb, i, cur);
1677}
1678#endif
1679
1680#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASK)
1681xfs_inofree_t
1682xfs_inobt_mask(int i)
1683{
1684 return XFS_INOBT_MASK(i);
1685}
1686#endif
1687
1688#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASKN)
1689xfs_inofree_t
1690xfs_inobt_maskn(int i, int n)
1691{
1692 return XFS_INOBT_MASKN(i, n);
1693}
1694#endif
1695
1696#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_PTR_ADDR)
1697xfs_inobt_ptr_t *
1698xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
1699{
1700 return XFS_INOBT_PTR_ADDR(bb, i, cur);
1701}
1702#endif
1703
1704#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_REC_ADDR)
1705/*ARGSUSED3*/
1706xfs_inobt_rec_t *
1707xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
1708{
1709 return XFS_INOBT_REC_ADDR(bb, i, cur);
1710}
1711#endif
1712
1713#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_SET_FREE)
1714void
1715xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i)
1716{
1717 XFS_INOBT_SET_FREE(rp, i);
1718}
1719#endif
1720
1721#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOBHV)
1722bhv_desc_t *
1723xfs_itobhv(xfs_inode_t *ip)
1724{
1725 return XFS_ITOBHV(ip);
1726}
1727#endif
1728
1729#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOV)
1730vnode_t *
1731xfs_itov(xfs_inode_t *ip)
1732{
1733 return XFS_ITOV(ip);
1734}
1735#endif
1736
1737#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBLOG)
1738int
1739xfs_lblog(xfs_mount_t *mp)
1740{
1741 return XFS_LBLOG(mp);
1742}
1743#endif
1744
1745#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBSIZE)
1746int
1747xfs_lbsize(xfs_mount_t *mp)
1748{
1749 return XFS_LBSIZE(mp);
1750}
1751#endif
1752
1753#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ALL_FREE)
1754void
1755xfs_lic_all_free(xfs_log_item_chunk_t *cp)
1756{
1757 XFS_LIC_ALL_FREE(cp);
1758}
1759#endif
1760
1761#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ARE_ALL_FREE)
1762int
1763xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
1764{
1765 return XFS_LIC_ARE_ALL_FREE(cp);
1766}
1767#endif
1768
1769#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_CLAIM)
1770void
1771xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
1772{
1773 XFS_LIC_CLAIM(cp, slot);
1774}
1775#endif
1776
1777#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_CHUNK)
1778xfs_log_item_chunk_t *
1779xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
1780{
1781 return XFS_LIC_DESC_TO_CHUNK(dp);
1782}
1783#endif
1784
1785#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_SLOT)
1786int
1787xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
1788{
1789 return XFS_LIC_DESC_TO_SLOT(dp);
1790}
1791#endif
1792
1793#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT)
1794void
1795xfs_lic_init(xfs_log_item_chunk_t *cp)
1796{
1797 XFS_LIC_INIT(cp);
1798}
1799#endif
1800
1801#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT_SLOT)
1802void
1803xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
1804{
1805 XFS_LIC_INIT_SLOT(cp, slot);
1806}
1807#endif
1808
1809#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ISFREE)
1810int
1811xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
1812{
1813 return XFS_LIC_ISFREE(cp, slot);
1814}
1815#endif
1816
1817#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_RELSE)
1818void
1819xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
1820{
1821 XFS_LIC_RELSE(cp, slot);
1822}
1823#endif
1824
1825#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_SLOT)
1826xfs_log_item_desc_t *
1827xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
1828{
1829 return XFS_LIC_SLOT(cp, slot);
1830}
1831#endif
1832
1833#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_VACANCY)
1834int
1835xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
1836{
1837 return XFS_LIC_VACANCY(cp);
1838}
1839#endif
1840
1841#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LITINO)
1842int
1843xfs_litino(xfs_mount_t *mp)
1844{
1845 return XFS_LITINO(mp);
1846}
1847#endif
1848
1849#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MAKE_IPTR)
1850xfs_dinode_t *
1851xfs_make_iptr(xfs_mount_t *mp, xfs_buf_t *b, int o)
1852{
1853 return XFS_MAKE_IPTR(mp, b, o);
1854}
1855#endif
1856
1857#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32HI)
1858__uint32_t
1859xfs_mask32hi(int n)
1860{
1861 return XFS_MASK32HI(n);
1862}
1863#endif
1864
1865#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32LO)
1866__uint32_t
1867xfs_mask32lo(int n)
1868{
1869 return XFS_MASK32LO(n);
1870}
1871#endif
1872
1873#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64HI)
1874__uint64_t
1875xfs_mask64hi(int n)
1876{
1877 return XFS_MASK64HI(n);
1878}
1879#endif
1880
1881#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64LO)
1882__uint64_t
1883xfs_mask64lo(int n)
1884{
1885 return XFS_MASK64LO(n);
1886}
1887#endif
1888
1889#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST)
1890int
1891xfs_min_freelist(xfs_agf_t *a, xfs_mount_t *mp)
1892{
1893 return XFS_MIN_FREELIST(a, mp);
1894}
1895#endif
1896
1897#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_PAG)
1898int
1899xfs_min_freelist_pag(xfs_perag_t *pag, xfs_mount_t *mp)
1900{
1901 return XFS_MIN_FREELIST_PAG(pag, mp);
1902}
1903#endif
1904
1905#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_RAW)
1906int
1907xfs_min_freelist_raw(uint bl, uint cl, xfs_mount_t *mp)
1908{
1909 return XFS_MIN_FREELIST_RAW(bl, cl, mp);
1910}
1911#endif
1912
1913#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MTOVFS)
1914vfs_t *
1915xfs_mtovfs(xfs_mount_t *mp)
1916{
1917 return XFS_MTOVFS(mp);
1918}
1919#endif
1920
1921#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_OFFBNO_TO_AGINO)
1922xfs_agino_t
1923xfs_offbno_to_agino(xfs_mount_t *mp, xfs_agblock_t b, int o)
1924{
1925 return XFS_OFFBNO_TO_AGINO(mp, b, o);
1926}
1927#endif
1928
1929#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_PREALLOC_BLOCKS)
1930xfs_agblock_t
1931xfs_prealloc_blocks(xfs_mount_t *mp)
1932{
1933 return XFS_PREALLOC_BLOCKS(mp);
1934}
1935#endif
1936
1937#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_BLOCK)
1938xfs_agblock_t
1939xfs_sb_block(xfs_mount_t *mp)
1940{
1941 return XFS_SB_BLOCK(mp);
1942}
1943#endif
1944
1945#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_GOOD_VERSION)
1946int
1947xfs_sb_good_version(xfs_sb_t *sbp)
1948{
1949 return XFS_SB_GOOD_VERSION(sbp);
1950}
1951#endif
1952
1953#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDATTR)
1954void
1955xfs_sb_version_addattr(xfs_sb_t *sbp)
1956{
1957 XFS_SB_VERSION_ADDATTR(sbp);
1958}
1959#endif
1960
1961#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDDALIGN)
1962void
1963xfs_sb_version_adddalign(xfs_sb_t *sbp)
1964{
1965 XFS_SB_VERSION_ADDDALIGN(sbp);
1966}
1967#endif
1968
1969#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDNLINK)
1970void
1971xfs_sb_version_addnlink(xfs_sb_t *sbp)
1972{
1973 XFS_SB_VERSION_ADDNLINK(sbp);
1974}
1975#endif
1976
1977#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDQUOTA)
1978void
1979xfs_sb_version_addquota(xfs_sb_t *sbp)
1980{
1981 XFS_SB_VERSION_ADDQUOTA(sbp);
1982}
1983#endif
1984
1985#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDSHARED)
1986void
1987xfs_sb_version_addshared(xfs_sb_t *sbp)
1988{
1989 XFS_SB_VERSION_ADDSHARED(sbp);
1990}
1991#endif
1992
1993#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASALIGN)
1994int
1995xfs_sb_version_hasalign(xfs_sb_t *sbp)
1996{
1997 return XFS_SB_VERSION_HASALIGN(sbp);
1998}
1999#endif
2000
2001#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASATTR)
2002int
2003xfs_sb_version_hasattr(xfs_sb_t *sbp)
2004{
2005 return XFS_SB_VERSION_HASATTR(sbp);
2006}
2007#endif
2008
2009#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDALIGN)
2010int
2011xfs_sb_version_hasdalign(xfs_sb_t *sbp)
2012{
2013 return XFS_SB_VERSION_HASDALIGN(sbp);
2014}
2015#endif
2016
2017#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDIRV2)
2018int
2019xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
2020{
2021 return XFS_SB_VERSION_HASDIRV2(sbp);
2022}
2023#endif
2024
2025#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
2026int
2027xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
2028{
2029 return XFS_SB_VERSION_HASEXTFLGBIT(sbp);
2030}
2031#endif
2032
2033#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASNLINK)
2034int
2035xfs_sb_version_hasnlink(xfs_sb_t *sbp)
2036{
2037 return XFS_SB_VERSION_HASNLINK(sbp);
2038}
2039#endif
2040
2041#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASQUOTA)
2042int
2043xfs_sb_version_hasquota(xfs_sb_t *sbp)
2044{
2045 return XFS_SB_VERSION_HASQUOTA(sbp);
2046}
2047#endif
2048
2049#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASSHARED)
2050int
2051xfs_sb_version_hasshared(xfs_sb_t *sbp)
2052{
2053 return XFS_SB_VERSION_HASSHARED(sbp);
2054}
2055#endif
2056
2057#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_NUM)
2058int
2059xfs_sb_version_num(xfs_sb_t *sbp)
2060{
2061 return XFS_SB_VERSION_NUM(sbp);
2062}
2063#endif
2064
2065#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBALIGN)
2066void
2067xfs_sb_version_subalign(xfs_sb_t *sbp)
2068{
2069 XFS_SB_VERSION_SUBALIGN(sbp);
2070}
2071#endif
2072
2073#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBSHARED)
2074void
2075xfs_sb_version_subshared(xfs_sb_t *sbp)
2076{
2077 XFS_SB_VERSION_SUBSHARED(sbp);
2078}
2079#endif
2080
2081#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASLOGV2)
2082int
2083xfs_sb_version_haslogv2(xfs_sb_t *sbp)
2084{
2085 return XFS_SB_VERSION_HASLOGV2(sbp);
2086}
2087#endif
2088
2089#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASSECTOR)
2090int
2091xfs_sb_version_hassector(xfs_sb_t *sbp)
2092{
2093 return XFS_SB_VERSION_HASSECTOR(sbp);
2094}
2095#endif
2096
2097#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TONEW)
2098unsigned
2099xfs_sb_version_tonew(unsigned v)
2100{
2101 return XFS_SB_VERSION_TONEW(v);
2102}
2103#endif
2104
2105#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TOOLD)
2106unsigned
2107xfs_sb_version_toold(unsigned v)
2108{
2109 return XFS_SB_VERSION_TOOLD(v);
2110}
2111#endif
2112
2113#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_ADD_SPACE)
2114void
2115xlog_grant_add_space(xlog_t *log, int bytes, int type)
2116{
2117 XLOG_GRANT_ADD_SPACE(log, bytes, type);
2118}
2119#endif
2120
2121#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_SUB_SPACE)
2122void
2123xlog_grant_sub_space(xlog_t *log, int bytes, int type)
2124{
2125 XLOG_GRANT_SUB_SPACE(log, bytes, type);
2126}
2127#endif
2128
2129#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASMOREBITS)
2130int
2131xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
2132{
2133 return XFS_SB_VERSION_HASMOREBITS(sbp);
2134}
2135#endif
2136
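/*
 * Every wrapper above follows one pattern; for a hypothetical macro
 * XFS_FOO(mp) (not an actual XFS symbol), the out-of-line version is
 * compiled either when all functions are wanted (XFS_WANT_FUNCS_C) or
 * when a space-saving build opts this macro in through its
 * XFSSO_XFS_FOO switch:
 *
 *	#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FOO)
 *	int
 *	xfs_foo(xfs_mount_t *mp)
 *	{
 *		return XFS_FOO(mp);
 *	}
 *	#endif
 *
 * The body is nothing but the macro expansion, so behaviour is
 * identical either way; only the inlining decision changes.
 */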
diff --git a/fs/xfs/xfs_macros.h b/fs/xfs/xfs_macros.h
new file mode 100644
index 000000000000..0a9307514a48
--- /dev/null
+++ b/fs/xfs/xfs_macros.h
@@ -0,0 +1,104 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_MACROS_H__
33#define __XFS_MACROS_H__
34
35/*
36 * Set for debug kernels and simulation
37 * These replacements save space.
38 * Used in xfs_macros.c.
39 */
40#define XFS_WANT_SPACE_C \
41 (!defined(_STANDALONE) && defined(DEBUG))
42
43/*
44 * Set for debug simulation and kernel builds, but not for standalone.
45 * These replacements do not save space.
46 * Used in xfs_macros.c.
47 */
48#define XFS_WANT_FUNCS_C \
49 (!defined(_STANDALONE) && defined(DEBUG))
50
51/*
52 * Corresponding names used in .h files.
53 */
54#define XFS_WANT_SPACE (XFS_WANT_SPACE_C && !defined(XFS_MACRO_C))
55#define XFS_WANT_FUNCS (XFS_WANT_FUNCS_C && !defined(XFS_MACRO_C))
56
57/*
58 * These are the macros that get turned into functions to save space.
59 */
60#define XFSSO_NULLSTARTBLOCK 1
61#define XFSSO_XFS_AGB_TO_DADDR 1
62#define XFSSO_XFS_AGB_TO_FSB 1
63#define XFSSO_XFS_AGINO_TO_INO 1
64#define XFSSO_XFS_ALLOC_BLOCK_MINRECS 1
65#define XFSSO_XFS_ATTR_SF_NEXTENTRY 1
66#define XFSSO_XFS_BMAP_BLOCK_DMAXRECS 1
67#define XFSSO_XFS_BMAP_BLOCK_IMAXRECS 1
68#define XFSSO_XFS_BMAP_BLOCK_IMINRECS 1
69#define XFSSO_XFS_BMAP_INIT 1
70#define XFSSO_XFS_BMAP_PTR_IADDR 1
71#define XFSSO_XFS_BMAP_SANITY_CHECK 1
72#define XFSSO_XFS_BMAPI_AFLAG 1
73#define XFSSO_XFS_CFORK_SIZE 1
74#define XFSSO_XFS_DA_COOKIE_BNO 1
75#define XFSSO_XFS_DA_COOKIE_ENTRY 1
76#define XFSSO_XFS_DADDR_TO_AGBNO 1
77#define XFSSO_XFS_DADDR_TO_FSB 1
78#define XFSSO_XFS_DFORK_PTR 1
79#define XFSSO_XFS_DIR_SF_GET_DIRINO 1
80#define XFSSO_XFS_DIR_SF_NEXTENTRY 1
81#define XFSSO_XFS_DIR_SF_PUT_DIRINO 1
82#define XFSSO_XFS_FILBLKS_MIN 1
83#define XFSSO_XFS_FSB_SANITY_CHECK 1
84#define XFSSO_XFS_FSB_TO_DADDR 1
85#define XFSSO_XFS_FSB_TO_DB 1
86#define XFSSO_XFS_IALLOC_INODES 1
87#define XFSSO_XFS_IFORK_ASIZE 1
88#define XFSSO_XFS_IFORK_DSIZE 1
89#define XFSSO_XFS_IFORK_FORMAT 1
90#define XFSSO_XFS_IFORK_NEXT_SET 1
91#define XFSSO_XFS_IFORK_NEXTENTS 1
92#define XFSSO_XFS_IFORK_PTR 1
93#define XFSSO_XFS_ILOG_FBROOT 1
94#define XFSSO_XFS_ILOG_FEXT 1
95#define XFSSO_XFS_INO_MASK 1
96#define XFSSO_XFS_INO_TO_FSB 1
97#define XFSSO_XFS_INODE_CLEAR_READ_AHEAD 1
98#define XFSSO_XFS_MIN_FREELIST 1
99#define XFSSO_XFS_SB_GOOD_VERSION 1
100#define XFSSO_XFS_SB_VERSION_HASNLINK 1
101#define XFSSO_XLOG_GRANT_ADD_SPACE 1
102#define XFSSO_XLOG_GRANT_SUB_SPACE 1
103
104#endif /* __XFS_MACROS_H__ */
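/*
 * A sketch of how a header is expected to consume these switches
 * (XFS_FOO/xfs_foo are again hypothetical names): when XFS_WANT_FUNCS
 * or this macro's space switch is in effect, the macro resolves to the
 * out-of-line function built in xfs_macros.c; otherwise it expands in
 * place as usual:
 *
 *	#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FOO)
 *	int xfs_foo(xfs_mount_t *mp);
 *	#define XFS_FOO(mp)	xfs_foo(mp)
 *	#else
 *	#define XFS_FOO(mp)	((mp)->m_foo)
 *	#endif
 */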
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
new file mode 100644
index 000000000000..b57423caef9b
--- /dev/null
+++ b/fs/xfs/xfs_mount.c
@@ -0,0 +1,1586 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_alloc.h"
56#include "xfs_rtalloc.h"
57#include "xfs_bmap.h"
58#include "xfs_error.h"
59#include "xfs_bit.h"
60#include "xfs_rw.h"
61#include "xfs_quota.h"
62#include "xfs_fsops.h"
63
64STATIC void xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
65STATIC int xfs_uuid_mount(xfs_mount_t *);
66STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
67
68static struct {
69 short offset;
70 short type; /* 0 = integer
71 * 1 = binary / string (no translation)
72 */
73} xfs_sb_info[] = {
74 { offsetof(xfs_sb_t, sb_magicnum), 0 },
75 { offsetof(xfs_sb_t, sb_blocksize), 0 },
76 { offsetof(xfs_sb_t, sb_dblocks), 0 },
77 { offsetof(xfs_sb_t, sb_rblocks), 0 },
78 { offsetof(xfs_sb_t, sb_rextents), 0 },
79 { offsetof(xfs_sb_t, sb_uuid), 1 },
80 { offsetof(xfs_sb_t, sb_logstart), 0 },
81 { offsetof(xfs_sb_t, sb_rootino), 0 },
82 { offsetof(xfs_sb_t, sb_rbmino), 0 },
83 { offsetof(xfs_sb_t, sb_rsumino), 0 },
84 { offsetof(xfs_sb_t, sb_rextsize), 0 },
85 { offsetof(xfs_sb_t, sb_agblocks), 0 },
86 { offsetof(xfs_sb_t, sb_agcount), 0 },
87 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
88 { offsetof(xfs_sb_t, sb_logblocks), 0 },
89 { offsetof(xfs_sb_t, sb_versionnum), 0 },
90 { offsetof(xfs_sb_t, sb_sectsize), 0 },
91 { offsetof(xfs_sb_t, sb_inodesize), 0 },
92 { offsetof(xfs_sb_t, sb_inopblock), 0 },
93 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
94 { offsetof(xfs_sb_t, sb_blocklog), 0 },
95 { offsetof(xfs_sb_t, sb_sectlog), 0 },
96 { offsetof(xfs_sb_t, sb_inodelog), 0 },
97 { offsetof(xfs_sb_t, sb_inopblog), 0 },
98 { offsetof(xfs_sb_t, sb_agblklog), 0 },
99 { offsetof(xfs_sb_t, sb_rextslog), 0 },
100 { offsetof(xfs_sb_t, sb_inprogress), 0 },
101 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
102 { offsetof(xfs_sb_t, sb_icount), 0 },
103 { offsetof(xfs_sb_t, sb_ifree), 0 },
104 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
105 { offsetof(xfs_sb_t, sb_frextents), 0 },
106 { offsetof(xfs_sb_t, sb_uquotino), 0 },
107 { offsetof(xfs_sb_t, sb_gquotino), 0 },
108 { offsetof(xfs_sb_t, sb_qflags), 0 },
109 { offsetof(xfs_sb_t, sb_flags), 0 },
110 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
111 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
112 { offsetof(xfs_sb_t, sb_unit), 0 },
113 { offsetof(xfs_sb_t, sb_width), 0 },
114 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
115 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
116 { offsetof(xfs_sb_t, sb_logsectsize), 0 },
117 { offsetof(xfs_sb_t, sb_logsunit), 0 },
118 { offsetof(xfs_sb_t, sb_features2), 0 },
119 { sizeof(xfs_sb_t), 0 }
120};
121
122/*
123 * Return a pointer to an initialized xfs_mount structure.
124 */
125xfs_mount_t *
126xfs_mount_init(void)
127{
128 xfs_mount_t *mp;
129
130 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
131
132 AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
133 spinlock_init(&mp->m_sb_lock, "xfs_sb");
134 mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock");
135 initnsema(&mp->m_growlock, 1, "xfs_grow");
136 /*
137 * Initialize the AIL.
138 */
139 xfs_trans_ail_init(mp);
140
141 atomic_set(&mp->m_active_trans, 0);
142
143 return mp;
144}
145
146/*
147 * Free up the resources associated with a mount structure. Assume that
148 * the structure was initially zeroed, so we can tell which fields got
149 * initialized.
150 */
151void
152xfs_mount_free(
153 xfs_mount_t *mp,
154 int remove_bhv)
155{
156 if (mp->m_ihash)
157 xfs_ihash_free(mp);
158 if (mp->m_chash)
159 xfs_chash_free(mp);
160
161 if (mp->m_perag) {
162 int agno;
163
164 for (agno = 0; agno < mp->m_maxagi; agno++)
165 if (mp->m_perag[agno].pagb_list)
166 kmem_free(mp->m_perag[agno].pagb_list,
167 sizeof(xfs_perag_busy_t) *
168 XFS_PAGB_NUM_SLOTS);
169 kmem_free(mp->m_perag,
170 sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
171 }
172
173 AIL_LOCK_DESTROY(&mp->m_ail_lock);
174 spinlock_destroy(&mp->m_sb_lock);
175 mutex_destroy(&mp->m_ilock);
176 freesema(&mp->m_growlock);
177 if (mp->m_quotainfo)
178 XFS_QM_DONE(mp);
179
180 if (mp->m_fsname != NULL)
181 kmem_free(mp->m_fsname, mp->m_fsname_len);
182
183 if (remove_bhv) {
184 struct vfs *vfsp = XFS_MTOVFS(mp);
185
186 bhv_remove_all_vfsops(vfsp, 0);
187 VFS_REMOVEBHV(vfsp, &mp->m_bhv);
188 }
189
190 kmem_free(mp, sizeof(xfs_mount_t));
191}
192
193
194/*
195 * Check the validity of the SB found.
196 */
197STATIC int
198xfs_mount_validate_sb(
199 xfs_mount_t *mp,
200 xfs_sb_t *sbp)
201{
202 /*
203 * If the log device and data device have the
204 * same device number, the log is internal.
205 * Consequently, the sb_logstart should be non-zero. If
206 * we have a zero sb_logstart in this case, we may be trying to mount
207 * a volume filesystem in a non-volume manner.
208 */
209 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
210 cmn_err(CE_WARN, "XFS: bad magic number");
211 return XFS_ERROR(EWRONGFS);
212 }
213
214 if (!XFS_SB_GOOD_VERSION(sbp)) {
215 cmn_err(CE_WARN, "XFS: bad version");
216 return XFS_ERROR(EWRONGFS);
217 }
218
219 if (unlikely(
220 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
221 cmn_err(CE_WARN,
222 "XFS: filesystem is marked as having an external log; "
223 "specify logdev on the\nmount command line.");
224 XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(1)",
225 XFS_ERRLEVEL_HIGH, mp, sbp);
226 return XFS_ERROR(EFSCORRUPTED);
227 }
228
229 if (unlikely(
230 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
231 cmn_err(CE_WARN,
232 "XFS: filesystem is marked as having an internal log; "
233 "don't specify logdev on\nthe mount command line.");
234 XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(2)",
235 XFS_ERRLEVEL_HIGH, mp, sbp);
236 return XFS_ERROR(EFSCORRUPTED);
237 }
238
239 /*
240 * More sanity checking. These were stolen directly from
241 * xfs_repair.
242 */
243 if (unlikely(
244 sbp->sb_agcount <= 0 ||
245 sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
246 sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
247 sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
248 sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
249 sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
250 sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
251 sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
252 sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
253 sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
254 sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
255 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
256 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
257 sbp->sb_imax_pct > 100)) {
258 cmn_err(CE_WARN, "XFS: SB sanity check 1 failed");
259 XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(3)",
260 XFS_ERRLEVEL_LOW, mp, sbp);
261 return XFS_ERROR(EFSCORRUPTED);
262 }
263
264 /*
265 * Sanity check AG count, size fields against data size field
266 */
267 if (unlikely(
268 sbp->sb_dblocks == 0 ||
269 sbp->sb_dblocks >
270 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
271 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
272 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
273 cmn_err(CE_WARN, "XFS: SB sanity check 2 failed");
274 XFS_ERROR_REPORT("xfs_mount_validate_sb(4)",
275 XFS_ERRLEVEL_LOW, mp);
276 return XFS_ERROR(EFSCORRUPTED);
277 }
278
279 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
280 ASSERT(sbp->sb_blocklog >= BBSHIFT);
281
282#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
283 if (unlikely(
284 (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX ||
285 (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) {
286#else /* Limited by UINT_MAX of sectors */
287 if (unlikely(
288 (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX ||
289 (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) {
290#endif
291 cmn_err(CE_WARN,
292 "XFS: File system is too large to be mounted on this system.");
293 return XFS_ERROR(E2BIG);
294 }
295
296 if (unlikely(sbp->sb_inprogress)) {
297 cmn_err(CE_WARN, "XFS: file system busy");
298 XFS_ERROR_REPORT("xfs_mount_validate_sb(5)",
299 XFS_ERRLEVEL_LOW, mp);
300 return XFS_ERROR(EFSCORRUPTED);
301 }
302
303 /*
304 * Until this is fixed, only page-sized or smaller data blocks work.
305 */
306 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
307 cmn_err(CE_WARN,
308 "XFS: Attempted to mount file system with blocksize %d bytes",
309 sbp->sb_blocksize);
310 cmn_err(CE_WARN,
311 "XFS: Only page-sized (%d) or less blocksizes currently work.",
312 PAGE_SIZE);
313 return XFS_ERROR(ENOSYS);
314 }
315
316 return 0;
317}
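/*
 * To make the size check above concrete (a worked example assuming
 * 512-byte basic blocks, i.e. BBSHIFT == 9): without XFS_BIG_BLKNOS
 * the data device is addressed as at most UINT_MAX 512-byte sectors,
 * so with 4k blocks (sb_blocklog == 12)
 *
 *	max sb_dblocks = UINT_MAX >> (12 - 9) = ~2^29 blocks = ~2 TiB
 *
 * With XFS_BIG_BLKNOS the limit comes instead from the page cache
 * index, ULONG_MAX pages, hence the shift by PAGE_SHIFT - blocklog.
 */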
318
319xfs_agnumber_t
320xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount)
321{
322 xfs_agnumber_t index, max_metadata;
323 xfs_perag_t *pag;
324 xfs_agino_t agino;
325 xfs_ino_t ino;
326 xfs_sb_t *sbp = &mp->m_sb;
327 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
328
329 /* Check to see whether inode numbers on this filesystem can overflow 32 bits */
330 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
331 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
332
333 /* Clear the mount flag if no inode can overflow 32 bits
334 * on this filesystem, or if specifically requested.
335 */
336 if ((mp->m_flags & XFS_MOUNT_32BITINOOPT) && ino > max_inum) {
337 mp->m_flags |= XFS_MOUNT_32BITINODES;
338 } else {
339 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
340 }
341
342 /* If we can overflow, then set up the ag headers accordingly */
343 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
344 /* Calculate how much should be reserved for inodes to
345 * meet the max inode percentage.
346 */
347 if (mp->m_maxicount) {
348 __uint64_t icount;
349
350 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
351 do_div(icount, 100);
352 icount += sbp->sb_agblocks - 1;
353 do_div(icount, mp->m_ialloc_blks);
354 max_metadata = icount;
355 } else {
356 max_metadata = agcount;
357 }
358 for (index = 0; index < agcount; index++) {
359 ino = XFS_AGINO_TO_INO(mp, index, agino);
360 if (ino > max_inum) {
361 index++;
362 break;
363 }
364
365 /* This ag is preferred for inodes */
366 pag = &mp->m_perag[index];
367 pag->pagi_inodeok = 1;
368 if (index < max_metadata)
369 pag->pagf_metadata = 1;
370 }
371 } else {
372 /* Set up default behavior for smaller filesystems */
373 for (index = 0; index < agcount; index++) {
374 pag = &mp->m_perag[index];
375 pag->pagi_inodeok = 1;
376 }
377 }
378 return index;
379}
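/*
 * An illustrative walk through the overflow test above (numbers chosen
 * for the example, not from any particular filesystem): an inode
 * number packs the ag number above an ag-relative inode number that is
 * sb_agblklog + sb_inopblog bits wide.  With sb_agblklog == 22 and
 * sb_inopblog == 4, agino occupies 26 bits, so
 *
 *	ino = ((xfs_ino_t)agno << 26) | agino;
 *
 * first exceeds XFS_MAXINUMBER_32 when agno reaches 2^(32 - 26) == 64,
 * and only ags below that boundary keep pagi_inodeok set.
 */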
380
381/*
382 * xfs_xlatesb
383 *
384 * data - on-disk version of sb
385 * sb - a superblock
386 * dir - conversion direction: <0 - convert sb to buf
387 * >0 - convert buf to sb
388 * fields - which fields to copy (bitmask)
389 */
390void
391xfs_xlatesb(
392 void *data,
393 xfs_sb_t *sb,
394 int dir,
395 __int64_t fields)
396{
397 xfs_caddr_t buf_ptr;
398 xfs_caddr_t mem_ptr;
399 xfs_sb_field_t f;
400 int first;
401 int size;
402
403 ASSERT(dir);
404 ASSERT(fields);
405
406 if (!fields)
407 return;
408
409 buf_ptr = (xfs_caddr_t)data;
410 mem_ptr = (xfs_caddr_t)sb;
411
412 while (fields) {
413 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
414 first = xfs_sb_info[f].offset;
415 size = xfs_sb_info[f + 1].offset - first;
416
417 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
418
419 if (size == 1 || xfs_sb_info[f].type == 1) {
420 if (dir > 0) {
421 memcpy(mem_ptr + first, buf_ptr + first, size);
422 } else {
423 memcpy(buf_ptr + first, mem_ptr + first, size);
424 }
425 } else {
426 switch (size) {
427 case 2:
428 INT_XLATE(*(__uint16_t*)(buf_ptr+first),
429 *(__uint16_t*)(mem_ptr+first),
430 dir, ARCH_CONVERT);
431 break;
432 case 4:
433 INT_XLATE(*(__uint32_t*)(buf_ptr+first),
434 *(__uint32_t*)(mem_ptr+first),
435 dir, ARCH_CONVERT);
436 break;
437 case 8:
438 INT_XLATE(*(__uint64_t*)(buf_ptr+first),
439 *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT);
440 break;
441 default:
442 ASSERT(0);
443 }
444 }
445
446 fields &= ~(1LL << f);
447 }
448}
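/*
 * Note that the field size used above is stored nowhere: it is derived
 * as the gap between consecutive offsets in xfs_sb_info, which is why
 * that table ends with a { sizeof(xfs_sb_t), 0 } sentinel.  The mount
 * path below translates every field from disk to memory:
 *
 *	xfs_xlatesb(XFS_BUF_PTR(bp), &mp->m_sb, 1, XFS_SB_ALL_BITS);
 *
 * and a single-field translation the other way (in-core to buffer,
 * dir < 0) would look like, e.g. for the stripe unit:
 *
 *	xfs_xlatesb(XFS_BUF_PTR(bp), sbp, -1, XFS_SB_UNIT);
 */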
449
450/*
451 * xfs_readsb
452 *
453 * Does the initial read of the superblock.
454 */
455int
456xfs_readsb(xfs_mount_t *mp)
457{
458 unsigned int sector_size;
459 unsigned int extra_flags;
460 xfs_buf_t *bp;
461 xfs_sb_t *sbp;
462 int error;
463
464 ASSERT(mp->m_sb_bp == NULL);
465 ASSERT(mp->m_ddev_targp != NULL);
466
467 /*
468 * Allocate a (locked) buffer to hold the superblock.
469 * This will be kept around at all times to optimize
470 * access to the superblock.
471 */
472 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
473 extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
474
475 bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
476 BTOBB(sector_size), extra_flags);
477 if (!bp || XFS_BUF_ISERROR(bp)) {
478 cmn_err(CE_WARN, "XFS: SB read failed");
479 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
480 goto fail;
481 }
482 ASSERT(XFS_BUF_ISBUSY(bp));
483 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
484
485 /*
486 * Initialize the mount structure from the superblock.
487 * But first do some basic consistency checking.
488 */
489 sbp = XFS_BUF_TO_SBP(bp);
490 xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS);
491
492 error = xfs_mount_validate_sb(mp, &(mp->m_sb));
493 if (error) {
494 cmn_err(CE_WARN, "XFS: SB validate failed");
495 goto fail;
496 }
497
498 /*
499 * We must be able to do sector-sized and sector-aligned IO.
500 */
501 if (sector_size > mp->m_sb.sb_sectsize) {
502 cmn_err(CE_WARN,
503 "XFS: device supports only %u byte sectors (not %u)",
504 sector_size, mp->m_sb.sb_sectsize);
505 error = ENOSYS;
506 goto fail;
507 }
508
509 /*
510 * If device sector size is smaller than the superblock size,
511 * re-read the superblock so the buffer is correctly sized.
512 */
513 if (sector_size < mp->m_sb.sb_sectsize) {
514 XFS_BUF_UNMANAGE(bp);
515 xfs_buf_relse(bp);
516 sector_size = mp->m_sb.sb_sectsize;
517 bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
518 BTOBB(sector_size), extra_flags);
519 if (!bp || XFS_BUF_ISERROR(bp)) {
520 cmn_err(CE_WARN, "XFS: SB re-read failed");
521 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
522 goto fail;
523 }
524 ASSERT(XFS_BUF_ISBUSY(bp));
525 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
526 }
527
528 mp->m_sb_bp = bp;
529 xfs_buf_relse(bp);
530 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
531 return 0;
532
533 fail:
534 if (bp) {
535 XFS_BUF_UNMANAGE(bp);
536 xfs_buf_relse(bp);
537 }
538 return error;
539}
540
541
542/*
543 * xfs_mount_common
544 *
545 * Mount initialization code establishing various mount
546 * fields from the superblock associated with the given
547 * mount structure.
548 */
549void
550xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
551{
552 int i;
553
554 mp->m_agfrotor = mp->m_agirotor = 0;
555 spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock");
556 mp->m_maxagi = mp->m_sb.sb_agcount;
557 mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
558 mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
559 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
560 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
561 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
562 mp->m_litino = sbp->sb_inodesize -
563 ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
564 mp->m_blockmask = sbp->sb_blocksize - 1;
565 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
566 mp->m_blockwmask = mp->m_blockwsize - 1;
567 INIT_LIST_HEAD(&mp->m_del_inodes);
568
569 /*
570 * Setup for attributes, in case they get created.
571 * This value is for inodes getting attributes for the first time;
572 * the per-inode value is for old attribute values.
573 */
574 ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
575 switch (sbp->sb_inodesize) {
576 case 256:
577 mp->m_attroffset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(2);
578 break;
579 case 512:
580 case 1024:
581 case 2048:
582 mp->m_attroffset = XFS_BMDR_SPACE_CALC(12);
583 break;
584 default:
585 ASSERT(0);
586 }
587 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
588
589 for (i = 0; i < 2; i++) {
590 mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
591 xfs_alloc, i == 0);
592 mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
593 xfs_alloc, i == 0);
594 }
595 for (i = 0; i < 2; i++) {
596 mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
597 xfs_bmbt, i == 0);
598 mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
599 xfs_bmbt, i == 0);
600 }
601 for (i = 0; i < 2; i++) {
602 mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
603 xfs_inobt, i == 0);
604 mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
605 xfs_inobt, i == 0);
606 }
607
608 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
609 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
610 sbp->sb_inopblock);
611 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
612}
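/*
 * A worked example for the inode allocation geometry just computed,
 * assuming 4k blocks, 256-byte inodes (sb_inopblock == 16,
 * sb_inopblog == 4) and the usual 64-inode chunk:
 *
 *	m_ialloc_inos = MAX(XFS_INODES_PER_CHUNK, 16) = 64
 *	m_ialloc_blks = 64 >> 4 = 4
 *
 * i.e. inodes are allocated in chunks spanning four fs blocks.
 */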
613/*
614 * xfs_mountfs
615 *
616 * This function does the following on an initial mount of a file system:
617 * - reads the superblock from disk and initializes the mount struct
618 * - if we're a 32-bit kernel, does a size check on the superblock
619 * so we don't mount terabyte filesystems
620 * - initializes the mount struct realtime fields
621 * - allocates the inode hash table for the fs
622 * - initializes the directory manager
623 * - performs recovery and initializes the log manager
624 */
625int
626xfs_mountfs(
627 vfs_t *vfsp,
628 xfs_mount_t *mp,
629 int mfsi_flags)
630{
631 xfs_buf_t *bp;
632 xfs_sb_t *sbp = &(mp->m_sb);
633 xfs_inode_t *rip;
634 vnode_t *rvp = NULL;
635 int readio_log, writeio_log;
636 xfs_daddr_t d;
637 __uint64_t ret64;
638 __int64_t update_flags;
639 uint quotamount, quotaflags;
640 int agno;
641 int uuid_mounted = 0;
642 int error = 0;
643
644 if (mp->m_sb_bp == NULL) {
645 if ((error = xfs_readsb(mp))) {
646 return (error);
647 }
648 }
649 xfs_mount_common(mp, sbp);
650
651 /*
652 * Check whether sb_agblocks is aligned at a stripe boundary.
653 * If sb_agblocks is NOT aligned, turn off m_dalign: allocator
654 * alignment is within an ag, therefore the ag itself has to
655 * be aligned at a stripe boundary.
656 */
657 update_flags = 0LL;
658 if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
659 /*
660 * If the stripe unit and stripe width are not multiples
661 * of the fs blocksize, turn off alignment.
662 */
663 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
664 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
665 if (mp->m_flags & XFS_MOUNT_RETERR) {
666 cmn_err(CE_WARN,
667 "XFS: alignment check 1 failed");
668 error = XFS_ERROR(EINVAL);
669 goto error1;
670 }
671 mp->m_dalign = mp->m_swidth = 0;
672 } else {
673 /*
674 * Convert the stripe unit and width to FSBs.
675 */
676 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
677 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
678 if (mp->m_flags & XFS_MOUNT_RETERR) {
679 error = XFS_ERROR(EINVAL);
680 goto error1;
681 }
682 xfs_fs_cmn_err(CE_WARN, mp,
683"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)",
684 mp->m_dalign, mp->m_swidth,
685 sbp->sb_agblocks);
686
687 mp->m_dalign = 0;
688 mp->m_swidth = 0;
689 } else if (mp->m_dalign) {
690 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
691 } else {
692 if (mp->m_flags & XFS_MOUNT_RETERR) {
693 xfs_fs_cmn_err(CE_WARN, mp,
694"stripe alignment turned off: sunit(%d) less than bsize(%d)",
695 mp->m_dalign,
696 			mp->m_blockmask + 1);
697 error = XFS_ERROR(EINVAL);
698 goto error1;
699 }
700 mp->m_swidth = 0;
701 }
702 }
703
704 /*
705 * Update superblock with new values
706 * and log changes
707 */
708 if (XFS_SB_VERSION_HASDALIGN(sbp)) {
709 if (sbp->sb_unit != mp->m_dalign) {
710 sbp->sb_unit = mp->m_dalign;
711 update_flags |= XFS_SB_UNIT;
712 }
713 if (sbp->sb_width != mp->m_swidth) {
714 sbp->sb_width = mp->m_swidth;
715 update_flags |= XFS_SB_WIDTH;
716 }
717 }
718 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
719 XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
720 mp->m_dalign = sbp->sb_unit;
721 mp->m_swidth = sbp->sb_width;
722 }
723
724 xfs_alloc_compute_maxlevels(mp);
725 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
726 xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
727 xfs_ialloc_compute_maxlevels(mp);
728
729 if (sbp->sb_imax_pct) {
730 __uint64_t icount;
731
732 /* Make sure the maximum inode count is a multiple of the
733 * units we allocate inodes in.
734 */
735
736 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
737 do_div(icount, 100);
738 do_div(icount, mp->m_ialloc_blks);
739 mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
740 sbp->sb_inopblog;
741 } else
742 mp->m_maxicount = 0;
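	/*
	 * Worked example (editor's sketch, not in the original source):
	 * assuming 4 KB blocks and 256-byte inodes, sb_inopblock = 16 and
	 * sb_inopblog = 4, so m_ialloc_inos = 64 and m_ialloc_blks = 4.
	 * With sb_dblocks = 1,000,000 and sb_imax_pct = 25, icount becomes
	 * 250,000, is rounded down to a whole number of inode allocation
	 * units (62,500 of them), and m_maxicount ends up as
	 * (62,500 * 4) << 4 = 4,000,000 inodes.
	 */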
743
744 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
745
746 /*
747 * XFS uses the uuid from the superblock as the unique
748 * identifier for fsid. We can not use the uuid from the volume
749 * since a single partition filesystem is identical to a single
750 * partition volume/filesystem.
751 */
752 if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
753 (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
754 if (xfs_uuid_mount(mp)) {
755 error = XFS_ERROR(EINVAL);
756 goto error1;
757 }
758		uuid_mounted = 1;
759 ret64 = uuid_hash64(&sbp->sb_uuid);
760 memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64));
761 }
762
763 /*
764 * Set the default minimum read and write sizes unless
765 * already specified in a mount option.
766 * We use smaller I/O sizes when the file system
767 * is being used for NFS service (wsync mount option).
768 */
769 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
770 if (mp->m_flags & XFS_MOUNT_WSYNC) {
771 readio_log = XFS_WSYNC_READIO_LOG;
772 writeio_log = XFS_WSYNC_WRITEIO_LOG;
773 } else {
774 readio_log = XFS_READIO_LOG_LARGE;
775 writeio_log = XFS_WRITEIO_LOG_LARGE;
776 }
777 } else {
778 readio_log = mp->m_readio_log;
779 writeio_log = mp->m_writeio_log;
780 }
781
782 /*
783 * Set the number of readahead buffers to use based on
784 * physical memory size.
785 */
786 if (xfs_physmem <= 4096) /* <= 16MB */
787 mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB;
788 else if (xfs_physmem <= 8192) /* <= 32MB */
789 mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB;
790 else
791 mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32;
792 if (sbp->sb_blocklog > readio_log) {
793 mp->m_readio_log = sbp->sb_blocklog;
794 } else {
795 mp->m_readio_log = readio_log;
796 }
797 mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
798 if (sbp->sb_blocklog > writeio_log) {
799 mp->m_writeio_log = sbp->sb_blocklog;
800 } else {
801 mp->m_writeio_log = writeio_log;
802 }
803 mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
804
805 /*
806 * Set the inode cluster size based on the physical memory
807 * size. This may still be overridden by the file system
808 * block size if it is larger than the chosen cluster size.
809 */
810 if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */
811 mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE;
812 } else {
813 mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
814 }
815 /*
816 * Set whether we're using inode alignment.
817 */
818 if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
819 mp->m_sb.sb_inoalignmt >=
820 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
821 mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
822 else
823 mp->m_inoalign_mask = 0;
824 /*
825 * If we are using stripe alignment, check whether
826 * the stripe unit is a multiple of the inode alignment
827 */
828 if (mp->m_dalign && mp->m_inoalign_mask &&
829 !(mp->m_dalign & mp->m_inoalign_mask))
830 mp->m_sinoalign = mp->m_dalign;
831 else
832 mp->m_sinoalign = 0;
833 /*
834 * Check that the data (and log if separate) are an ok size.
835 */
836 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
837 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
838 cmn_err(CE_WARN, "XFS: size check 1 failed");
839 error = XFS_ERROR(E2BIG);
840 goto error1;
841 }
842 error = xfs_read_buf(mp, mp->m_ddev_targp,
843 d - XFS_FSS_TO_BB(mp, 1),
844 XFS_FSS_TO_BB(mp, 1), 0, &bp);
845 if (!error) {
846 xfs_buf_relse(bp);
847 } else {
848 cmn_err(CE_WARN, "XFS: size check 2 failed");
849 if (error == ENOSPC) {
850 error = XFS_ERROR(E2BIG);
851 }
852 goto error1;
853 }
854
855 if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
856 mp->m_logdev_targp != mp->m_ddev_targp) {
857 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
858 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
859 cmn_err(CE_WARN, "XFS: size check 3 failed");
860 error = XFS_ERROR(E2BIG);
861 goto error1;
862 }
863 error = xfs_read_buf(mp, mp->m_logdev_targp,
864 d - XFS_FSB_TO_BB(mp, 1),
865 XFS_FSB_TO_BB(mp, 1), 0, &bp);
866 if (!error) {
867 xfs_buf_relse(bp);
868 } else {
869			cmn_err(CE_WARN, "XFS: size check 4 failed");
870 if (error == ENOSPC) {
871 error = XFS_ERROR(E2BIG);
872 }
873 goto error1;
874 }
875 }
876
877 /*
878 * Initialize realtime fields in the mount structure
879 */
880 if ((error = xfs_rtmount_init(mp))) {
881 cmn_err(CE_WARN, "XFS: RT mount failed");
882 goto error1;
883 }
884
885 /*
886 * For client case we are done now
887 */
888 if (mfsi_flags & XFS_MFSI_CLIENT) {
889 return(0);
890 }
891
892 /*
893 * Copies the low order bits of the timestamp and the randomly
894 * set "sequence" number out of a UUID.
895 */
896 uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
897
898 /*
899 * The vfs structure needs to have a file system independent
900 * way of checking for the invariant file system ID. Since it
901 * can't look at mount structures it has a pointer to the data
902 * in the mount structure.
903 *
904 * File systems that don't support user level file handles (i.e.
905 * all of them except for XFS) will leave vfs_altfsid as NULL.
906 */
907 vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
908 mp->m_dmevmask = 0; /* not persistent; set after each mount */
909
910 /*
911 * Select the right directory manager.
912 */
913 mp->m_dirops =
914 XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
915 xfsv2_dirops :
916 xfsv1_dirops;
917
918 /*
919 * Initialize directory manager's entries.
920 */
921 XFS_DIR_MOUNT(mp);
922
923 /*
924 * Initialize the attribute manager's entries.
925 */
926 mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
927
928 /*
929 * Initialize the precomputed transaction reservations values.
930 */
931 xfs_trans_init(mp);
932
933 /*
934 * Allocate and initialize the inode hash table for this
935 * file system.
936 */
937 xfs_ihash_init(mp);
938 xfs_chash_init(mp);
939
940 /*
941 * Allocate and initialize the per-ag data.
942 */
943 init_rwsem(&mp->m_peraglock);
944 mp->m_perag =
945 kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
946
947 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
948
949 /*
950	 * Log's mount-time initialization. Perform the first phase of recovery if needed.
951 */
952 if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */
953 error = xfs_log_mount(mp, mp->m_logdev_targp,
954 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
955 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
956 if (error) {
957 cmn_err(CE_WARN, "XFS: log mount failed");
958 goto error2;
959 }
960 } else { /* No log has been defined */
961 cmn_err(CE_WARN, "XFS: no log defined");
962 XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
963 error = XFS_ERROR(EFSCORRUPTED);
964 goto error2;
965 }
966
967 /*
968 * Get and sanity-check the root inode.
969 * Save the pointer to it in the mount structure.
970 */
971 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
972 if (error) {
973 cmn_err(CE_WARN, "XFS: failed to read root inode");
974 goto error3;
975 }
976
977 ASSERT(rip != NULL);
978 rvp = XFS_ITOV(rip);
979
980 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
981 cmn_err(CE_WARN, "XFS: corrupted root inode");
982 prdev("Root inode %llu is not a directory",
983 mp->m_ddev_targp, (unsigned long long)rip->i_ino);
984 xfs_iunlock(rip, XFS_ILOCK_EXCL);
985 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
986 mp);
987 error = XFS_ERROR(EFSCORRUPTED);
988 goto error4;
989 }
990 mp->m_rootip = rip; /* save it */
991
992 xfs_iunlock(rip, XFS_ILOCK_EXCL);
993
994 /*
995 * Initialize realtime inode pointers in the mount structure
996 */
997 if ((error = xfs_rtmount_inodes(mp))) {
998 /*
999 * Free up the root inode.
1000 */
1001 cmn_err(CE_WARN, "XFS: failed to read RT inodes");
1002 goto error4;
1003 }
1004
1005 /*
1006 * If fs is not mounted readonly, then update the superblock
1007 * unit and width changes.
1008 */
1009 if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY))
1010 xfs_mount_log_sbunit(mp, update_flags);
1011
1012 /*
1013 * Initialise the XFS quota management subsystem for this mount
1014 */
1015 if ((error = XFS_QM_INIT(mp, &quotamount, &quotaflags)))
1016 goto error4;
1017
1018 /*
1019 * Finish recovering the file system. This part needed to be
1020 * delayed until after the root and real-time bitmap inodes
1021 * were consistently read in.
1022 */
1023 error = xfs_log_mount_finish(mp, mfsi_flags);
1024 if (error) {
1025 cmn_err(CE_WARN, "XFS: log mount finish failed");
1026 goto error4;
1027 }
1028
1029 /*
1030 * Complete the quota initialisation, post-log-replay component.
1031 */
1032 if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags)))
1033 goto error4;
1034
1035 return 0;
1036
1037 error4:
1038 /*
1039 * Free up the root inode.
1040 */
1041 VN_RELE(rvp);
1042 error3:
1043 xfs_log_unmount_dealloc(mp);
1044 error2:
1045 xfs_ihash_free(mp);
1046 xfs_chash_free(mp);
1047 for (agno = 0; agno < sbp->sb_agcount; agno++)
1048 if (mp->m_perag[agno].pagb_list)
1049 kmem_free(mp->m_perag[agno].pagb_list,
1050 sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
1051 kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
1052 mp->m_perag = NULL;
1053 /* FALLTHROUGH */
1054 error1:
1055 if (uuid_mounted)
1056 xfs_uuid_unmount(mp);
1057 xfs_freesb(mp);
1058 return error;
1059}
1060
1061/*
1062 * xfs_unmountfs
1063 *
1064 * This flushes out the inodes, dquots and the superblock, unmounts the
1065 * log and makes sure that incore structures are freed.
1066 */
1067int
1068xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1069{
1070 struct vfs *vfsp = XFS_MTOVFS(mp);
1071#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
1072 int64_t fsid;
1073#endif
1074
1075 xfs_iflush_all(mp, XFS_FLUSH_ALL);
1076
1077 XFS_QM_DQPURGEALL(mp,
1078 XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING);
1079
1080 /*
1081 * Flush out the log synchronously so that we know for sure
1082 * that nothing is pinned. This is important because bflush()
1083 * will skip pinned buffers.
1084 */
1085 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1086
1087 xfs_binval(mp->m_ddev_targp);
1088 if (mp->m_rtdev_targp) {
1089 xfs_binval(mp->m_rtdev_targp);
1090 }
1091
1092 xfs_unmountfs_writesb(mp);
1093
1094 xfs_unmountfs_wait(mp); /* wait for async bufs */
1095
1096 xfs_log_unmount(mp); /* Done! No more fs ops. */
1097
1098 xfs_freesb(mp);
1099
1100 /*
1101 * All inodes from this mount point should be freed.
1102 */
1103 ASSERT(mp->m_inodes == NULL);
1104
1105 /*
1106	 * We may have bufs that are still in the process of being written.
1107	 * We must wait for the I/O completion of those. The sync flag here
1108	 * does a two-pass iteration through the bufcache.
1109 */
1110 if (XFS_FORCED_SHUTDOWN(mp)) {
1111 xfs_incore_relse(mp->m_ddev_targp, 0, 1); /* synchronous */
1112 }
1113
1114 xfs_unmountfs_close(mp, cr);
1115 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1116 xfs_uuid_unmount(mp);
1117
1118#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
1119 /*
1120 * clear all error tags on this filesystem
1121 */
1122 memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t));
1123 xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0);
1124#endif
1125 XFS_IODONE(vfsp);
1126 xfs_mount_free(mp, 1);
1127 return 0;
1128}
1129
1130void
1131xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
1132{
1133 if (mp->m_logdev_targp != mp->m_ddev_targp)
1134 xfs_free_buftarg(mp->m_logdev_targp, 1);
1135 if (mp->m_rtdev_targp)
1136 xfs_free_buftarg(mp->m_rtdev_targp, 1);
1137 xfs_free_buftarg(mp->m_ddev_targp, 0);
1138}
1139
1140void
1141xfs_unmountfs_wait(xfs_mount_t *mp)
1142{
1143 if (mp->m_logdev_targp != mp->m_ddev_targp)
1144 xfs_wait_buftarg(mp->m_logdev_targp);
1145 if (mp->m_rtdev_targp)
1146 xfs_wait_buftarg(mp->m_rtdev_targp);
1147 xfs_wait_buftarg(mp->m_ddev_targp);
1148}
1149
1150int
1151xfs_unmountfs_writesb(xfs_mount_t *mp)
1152{
1153 xfs_buf_t *sbp;
1154 xfs_sb_t *sb;
1155 int error = 0;
1156
1157 /*
1158 * skip superblock write if fs is read-only, or
1159 * if we are doing a forced umount.
1160 */
1161 sbp = xfs_getsb(mp, 0);
1162 if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
1163 XFS_FORCED_SHUTDOWN(mp))) {
1164 /*
1165 * mark shared-readonly if desired
1166 */
1167 sb = XFS_BUF_TO_SBP(sbp);
1168 if (mp->m_mk_sharedro) {
1169 if (!(sb->sb_flags & XFS_SBF_READONLY))
1170 sb->sb_flags |= XFS_SBF_READONLY;
1171 if (!XFS_SB_VERSION_HASSHARED(sb))
1172 XFS_SB_VERSION_ADDSHARED(sb);
1173 xfs_fs_cmn_err(CE_NOTE, mp,
1174 "Unmounting, marking shared read-only");
1175 }
1176 XFS_BUF_UNDONE(sbp);
1177 XFS_BUF_UNREAD(sbp);
1178 XFS_BUF_UNDELAYWRITE(sbp);
1179 XFS_BUF_WRITE(sbp);
1180 XFS_BUF_UNASYNC(sbp);
1181 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1182 xfsbdstrat(mp, sbp);
1183 /* Nevermind errors we might get here. */
1184 error = xfs_iowait(sbp);
1185 if (error)
1186 xfs_ioerror_alert("xfs_unmountfs_writesb",
1187 mp, sbp, XFS_BUF_ADDR(sbp));
1188 if (error && mp->m_mk_sharedro)
1189 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1190 }
1191 xfs_buf_relse(sbp);
1192 return (error);
1193}
1194
1195/*
1196 * xfs_mod_sb() can be used to copy arbitrary changes to the
1197 * in-core superblock into the superblock buffer to be logged.
1198 * It does not provide the higher level of locking that is
1199 * needed to protect the in-core superblock from concurrent
1200 * access.
1201 */
1202void
1203xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1204{
1205 xfs_buf_t *bp;
1206 int first;
1207 int last;
1208 xfs_mount_t *mp;
1209 xfs_sb_t *sbp;
1210 xfs_sb_field_t f;
1211
1212 ASSERT(fields);
1213 if (!fields)
1214 return;
1215 mp = tp->t_mountp;
1216 bp = xfs_trans_getsb(tp, mp, 0);
1217 sbp = XFS_BUF_TO_SBP(bp);
1218 first = sizeof(xfs_sb_t);
1219 last = 0;
1220
1221 /* translate/copy */
1222
1223 xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields);
1224
1225 /* find modified range */
1226
1227 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1228 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1229 first = xfs_sb_info[f].offset;
1230
1231 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1232 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1233 last = xfs_sb_info[f + 1].offset - 1;
1234
1235 xfs_trans_log_buf(tp, bp, first, last);
1236}
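/*
 * Example (editor's sketch, not part of the original source): a caller
 * that has already changed the in-core superblock inside a transaction
 * logs just the fields it touched; new_unit below is a hypothetical
 * value:
 *
 *	mp->m_sb.sb_unit = new_unit;
 *	xfs_mod_sb(tp, XFS_SB_UNIT);
 *
 * xfs_mount_log_sbunit() at the end of this file wraps this pattern in
 * its own transaction.
 */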
1237
1238/*
1239 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
1240 * a delta to a specified field in the in-core superblock. Simply
1241 * switch on the field indicated and apply the delta to that field.
1242 * Fields are not allowed to dip below zero, so if the delta would
1243 * do this, do not apply it and return EINVAL.
1244 *
1245 * The SB_LOCK must be held when this routine is called.
1246 */
1247STATIC int
1248xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
1249 int delta, int rsvd)
1250{
1251 int scounter; /* short counter for 32 bit fields */
1252 long long lcounter; /* long counter for 64 bit fields */
1253 long long res_used, rem;
1254
1255 /*
1256 * With the in-core superblock spin lock held, switch
1257 * on the indicated field. Apply the delta to the
1258	 * proper field. If the field's value would dip below
1259 * 0, then do not apply the delta and return EINVAL.
1260 */
1261 switch (field) {
1262 case XFS_SBS_ICOUNT:
1263 lcounter = (long long)mp->m_sb.sb_icount;
1264 lcounter += delta;
1265 if (lcounter < 0) {
1266 ASSERT(0);
1267 return (XFS_ERROR(EINVAL));
1268 }
1269 mp->m_sb.sb_icount = lcounter;
1270 return (0);
1271 case XFS_SBS_IFREE:
1272 lcounter = (long long)mp->m_sb.sb_ifree;
1273 lcounter += delta;
1274 if (lcounter < 0) {
1275 ASSERT(0);
1276 return (XFS_ERROR(EINVAL));
1277 }
1278 mp->m_sb.sb_ifree = lcounter;
1279 return (0);
1280 case XFS_SBS_FDBLOCKS:
1281
1282 lcounter = (long long)mp->m_sb.sb_fdblocks;
1283 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1284
1285 if (delta > 0) { /* Putting blocks back */
1286 if (res_used > delta) {
1287 mp->m_resblks_avail += delta;
1288 } else {
1289 rem = delta - res_used;
1290 mp->m_resblks_avail = mp->m_resblks;
1291 lcounter += rem;
1292 }
1293 } else { /* Taking blocks away */
1294
1295 lcounter += delta;
1296
1297 /*
1298 * If were out of blocks, use any available reserved blocks if
1299 * were allowed to.
1300 */
1301
1302 if (lcounter < 0) {
1303 if (rsvd) {
1304 lcounter = (long long)mp->m_resblks_avail + delta;
1305 if (lcounter < 0) {
1306 return (XFS_ERROR(ENOSPC));
1307 }
1308 mp->m_resblks_avail = lcounter;
1309 return (0);
1310 } else { /* not reserved */
1311 return (XFS_ERROR(ENOSPC));
1312 }
1313 }
1314 }
1315
1316 mp->m_sb.sb_fdblocks = lcounter;
1317 return (0);
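	/*
	 * Worked example (editor's sketch, not in the original source):
	 * with m_resblks = 100 and m_resblks_avail = 40, res_used is 60.
	 * Returning delta = +80 blocks refills the reserve pool to 100
	 * and adds the remaining 20 to sb_fdblocks. Taking delta = -50
	 * when only 30 free blocks remain would go negative, so with
	 * rsvd set the whole delta is taken from m_resblks_avail
	 * instead of failing with ENOSPC.
	 */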
1318 case XFS_SBS_FREXTENTS:
1319 lcounter = (long long)mp->m_sb.sb_frextents;
1320 lcounter += delta;
1321 if (lcounter < 0) {
1322 return (XFS_ERROR(ENOSPC));
1323 }
1324 mp->m_sb.sb_frextents = lcounter;
1325 return (0);
1326 case XFS_SBS_DBLOCKS:
1327 lcounter = (long long)mp->m_sb.sb_dblocks;
1328 lcounter += delta;
1329 if (lcounter < 0) {
1330 ASSERT(0);
1331 return (XFS_ERROR(EINVAL));
1332 }
1333 mp->m_sb.sb_dblocks = lcounter;
1334 return (0);
1335 case XFS_SBS_AGCOUNT:
1336 scounter = mp->m_sb.sb_agcount;
1337 scounter += delta;
1338 if (scounter < 0) {
1339 ASSERT(0);
1340 return (XFS_ERROR(EINVAL));
1341 }
1342 mp->m_sb.sb_agcount = scounter;
1343 return (0);
1344 case XFS_SBS_IMAX_PCT:
1345 scounter = mp->m_sb.sb_imax_pct;
1346 scounter += delta;
1347 if (scounter < 0) {
1348 ASSERT(0);
1349 return (XFS_ERROR(EINVAL));
1350 }
1351 mp->m_sb.sb_imax_pct = scounter;
1352 return (0);
1353 case XFS_SBS_REXTSIZE:
1354 scounter = mp->m_sb.sb_rextsize;
1355 scounter += delta;
1356 if (scounter < 0) {
1357 ASSERT(0);
1358 return (XFS_ERROR(EINVAL));
1359 }
1360 mp->m_sb.sb_rextsize = scounter;
1361 return (0);
1362 case XFS_SBS_RBMBLOCKS:
1363 scounter = mp->m_sb.sb_rbmblocks;
1364 scounter += delta;
1365 if (scounter < 0) {
1366 ASSERT(0);
1367 return (XFS_ERROR(EINVAL));
1368 }
1369 mp->m_sb.sb_rbmblocks = scounter;
1370 return (0);
1371 case XFS_SBS_RBLOCKS:
1372 lcounter = (long long)mp->m_sb.sb_rblocks;
1373 lcounter += delta;
1374 if (lcounter < 0) {
1375 ASSERT(0);
1376 return (XFS_ERROR(EINVAL));
1377 }
1378 mp->m_sb.sb_rblocks = lcounter;
1379 return (0);
1380 case XFS_SBS_REXTENTS:
1381 lcounter = (long long)mp->m_sb.sb_rextents;
1382 lcounter += delta;
1383 if (lcounter < 0) {
1384 ASSERT(0);
1385 return (XFS_ERROR(EINVAL));
1386 }
1387 mp->m_sb.sb_rextents = lcounter;
1388 return (0);
1389 case XFS_SBS_REXTSLOG:
1390 scounter = mp->m_sb.sb_rextslog;
1391 scounter += delta;
1392 if (scounter < 0) {
1393 ASSERT(0);
1394 return (XFS_ERROR(EINVAL));
1395 }
1396 mp->m_sb.sb_rextslog = scounter;
1397 return (0);
1398 default:
1399 ASSERT(0);
1400 return (XFS_ERROR(EINVAL));
1401 }
1402}
1403
1404/*
1405 * xfs_mod_incore_sb() is used to change a field in the in-core
1406 * superblock structure by the specified delta. This modification
1407 * is protected by the SB_LOCK. Just use the xfs_mod_incore_sb_unlocked()
1408 * routine to do the work.
1409 */
1410int
1411xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
1412{
1413 unsigned long s;
1414 int status;
1415
1416 s = XFS_SB_LOCK(mp);
1417 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1418 XFS_SB_UNLOCK(mp, s);
1419 return (status);
1420}
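/*
 * Example (editor's sketch, not part of the original source): the
 * transaction code reserves free blocks by passing a negative delta,
 * where nblks is a hypothetical block count:
 *
 *	error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, -nblks, rsvd);
 *
 * A nonzero return (ENOSPC for this field) means the counter was left
 * untouched.
 */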
1421
1422/*
1423 * xfs_mod_incore_sb_batch() is used to change more than one field
1424 * in the in-core superblock structure at a time. This modification
1425 * is protected by a lock internal to this module. The fields and
1426 * changes to those fields are specified in the array of xfs_mod_sb
1427 * structures passed in.
1428 *
1429 * Either all of the specified deltas will be applied or none of
1430 * them will. If any modified field dips below 0, then all modifications
1431 * will be backed out and EINVAL will be returned.
1432 */
1433int
1434xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
1435{
1436 unsigned long s;
1437 int status=0;
1438 xfs_mod_sb_t *msbp;
1439
1440 /*
1441 * Loop through the array of mod structures and apply each
1442 * individually. If any fail, then back out all those
1443 * which have already been applied. Do all of this within
1444 * the scope of the SB_LOCK so that all of the changes will
1445 * be atomic.
1446 */
1447 s = XFS_SB_LOCK(mp);
1448	msbp = &msb[0];
1449	for (; msbp < (msb + nmsb); msbp++) {
1450 /*
1451 * Apply the delta at index n. If it fails, break
1452 * from the loop so we'll fall into the undo loop
1453 * below.
1454 */
1455 status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1456 msbp->msb_delta, rsvd);
1457 if (status != 0) {
1458 break;
1459 }
1460 }
1461
1462 /*
1463 * If we didn't complete the loop above, then back out
1464 * any changes made to the superblock. If you add code
1465 * between the loop above and here, make sure that you
1466 * preserve the value of status. Loop back until
1467 * we step below the beginning of the array. Make sure
1468 * we don't touch anything back there.
1469 */
1470 if (status != 0) {
1471 msbp--;
1472 while (msbp >= msb) {
1473 status = xfs_mod_incore_sb_unlocked(mp,
1474 msbp->msb_field, -(msbp->msb_delta), rsvd);
1475 ASSERT(status == 0);
1476 msbp--;
1477 }
1478 }
1479 XFS_SB_UNLOCK(mp, s);
1480 return (status);
1481}
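/*
 * Example (editor's sketch, not part of the original source): freeing
 * an inode chunk must move two counters together, so a caller could
 * batch the deltas and rely on the all-or-nothing semantics:
 *
 *	xfs_mod_sb_t	msb[2];
 *
 *	msb[0].msb_field = XFS_SBS_ICOUNT;
 *	msb[0].msb_delta = -XFS_INODES_PER_CHUNK;
 *	msb[1].msb_field = XFS_SBS_IFREE;
 *	msb[1].msb_delta = -XFS_INODES_PER_CHUNK;
 *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
 *
 * If either delta would take its field below zero, both changes are
 * backed out and EINVAL is returned.
 */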
1482
1483/*
1484 * xfs_getsb() is called to obtain the buffer for the superblock.
1485 * The buffer is returned locked and read in from disk.
1486 * The buffer should be released with a call to xfs_buf_relse().
1487 *
1488 * If the flags parameter is XFS_BUF_TRYLOCK, then we'll only return
1489 * the superblock buffer if it can be locked without sleeping.
1490 * If it can't then we'll return NULL.
1491 */
1492xfs_buf_t *
1493xfs_getsb(
1494 xfs_mount_t *mp,
1495 int flags)
1496{
1497 xfs_buf_t *bp;
1498
1499 ASSERT(mp->m_sb_bp != NULL);
1500 bp = mp->m_sb_bp;
1501 if (flags & XFS_BUF_TRYLOCK) {
1502 if (!XFS_BUF_CPSEMA(bp)) {
1503 return NULL;
1504 }
1505 } else {
1506 XFS_BUF_PSEMA(bp, PRIBIO);
1507 }
1508 XFS_BUF_HOLD(bp);
1509 ASSERT(XFS_BUF_ISDONE(bp));
1510 return (bp);
1511}
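/*
 * Example (editor's sketch, not part of the original source): a caller
 * that must not sleep can probe for the superblock buffer and back off
 * if it is held elsewhere:
 *
 *	bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
 *	if (bp == NULL)
 *		return;
 *	... examine or modify the superblock ...
 *	xfs_buf_relse(bp);
 */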
1512
1513/*
1514 * Used to free the superblock along various error paths.
1515 */
1516void
1517xfs_freesb(
1518 xfs_mount_t *mp)
1519{
1520 xfs_buf_t *bp;
1521
1522 /*
1523 * Use xfs_getsb() so that the buffer will be locked
1524 * when we call xfs_buf_relse().
1525 */
1526 bp = xfs_getsb(mp, 0);
1527 XFS_BUF_UNMANAGE(bp);
1528 xfs_buf_relse(bp);
1529 mp->m_sb_bp = NULL;
1530}
1531
1532/*
1533 * See if the UUID is unique among mounted XFS filesystems.
1534 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
1535 */
1536STATIC int
1537xfs_uuid_mount(
1538 xfs_mount_t *mp)
1539{
1540 if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
1541 cmn_err(CE_WARN,
1542 "XFS: Filesystem %s has nil UUID - can't mount",
1543 mp->m_fsname);
1544 return -1;
1545 }
1546 if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
1547 cmn_err(CE_WARN,
1548 "XFS: Filesystem %s has duplicate UUID - can't mount",
1549 mp->m_fsname);
1550 return -1;
1551 }
1552 return 0;
1553}
1554
1555/*
1556 * Remove filesystem from the UUID table.
1557 */
1558STATIC void
1559xfs_uuid_unmount(
1560 xfs_mount_t *mp)
1561{
1562 uuid_table_remove(&mp->m_sb.sb_uuid);
1563}
1564
1565/*
1566 * Used to log changes to the superblock unit and width fields which could
1567 * be altered by the mount options. Only the first superblock is updated.
1568 */
1569STATIC void
1570xfs_mount_log_sbunit(
1571 xfs_mount_t *mp,
1572 __int64_t fields)
1573{
1574 xfs_trans_t *tp;
1575
1576 ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));
1577
1578 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1579 if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1580 XFS_DEFAULT_LOG_COUNT)) {
1581 xfs_trans_cancel(tp, 0);
1582 return;
1583 }
1584 xfs_mod_sb(tp, fields);
1585 xfs_trans_commit(tp, 0, NULL);
1586}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
new file mode 100644
index 000000000000..5fc6201dd8e2
--- /dev/null
+++ b/fs/xfs/xfs_mount.h
@@ -0,0 +1,573 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_MOUNT_H__
33#define __XFS_MOUNT_H__
34
35
36typedef struct xfs_trans_reservations {
37 uint tr_write; /* extent alloc trans */
38 uint tr_itruncate; /* truncate trans */
39 uint tr_rename; /* rename trans */
40 uint tr_link; /* link trans */
41 uint tr_remove; /* unlink trans */
42 uint tr_symlink; /* symlink trans */
43 uint tr_create; /* create trans */
44 uint tr_mkdir; /* mkdir trans */
45 uint tr_ifree; /* inode free trans */
46 uint tr_ichange; /* inode update trans */
47 uint tr_growdata; /* fs data section grow trans */
48 uint tr_swrite; /* sync write inode trans */
49 uint tr_addafork; /* cvt inode to attributed trans */
50 uint tr_writeid; /* write setuid/setgid file */
51 uint tr_attrinval; /* attr fork buffer invalidation */
52 uint tr_attrset; /* set/create an attribute */
53 uint tr_attrrm; /* remove an attribute */
54 uint tr_clearagi; /* clear bad agi unlinked ino bucket */
55 uint tr_growrtalloc; /* grow realtime allocations */
56 uint tr_growrtzero; /* grow realtime zeroing */
57 uint tr_growrtfree; /* grow realtime freeing */
58} xfs_trans_reservations_t;
59
60
61#ifndef __KERNEL__
62/*
63 * Moved here from xfs_ag.h to avoid reordering header files
64 */
65#define XFS_DADDR_TO_AGNO(mp,d) \
66 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
67#define XFS_DADDR_TO_AGBNO(mp,d) \
68 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
69#else
70struct cred;
71struct log;
72struct vfs;
73struct vnode;
74struct xfs_mount_args;
75struct xfs_ihash;
76struct xfs_chash;
77struct xfs_inode;
78struct xfs_perag;
79struct xfs_iocore;
80struct xfs_bmbt_irec;
81struct xfs_bmap_free;
82
83#define AIL_LOCK_T lock_t
84#define AIL_LOCKINIT(x,y) spinlock_init(x,y)
85#define AIL_LOCK_DESTROY(x) spinlock_destroy(x)
86#define AIL_LOCK(mp,s) s=mutex_spinlock(&(mp)->m_ail_lock)
87#define AIL_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_ail_lock, s)
88
89
90/*
91 * Prototypes and functions for the Data Migration subsystem.
92 */
93
94typedef int (*xfs_send_data_t)(int, struct vnode *,
95 xfs_off_t, size_t, int, vrwlock_t *);
96typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
97typedef int (*xfs_send_destroy_t)(struct vnode *, dm_right_t);
98typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct vfs *,
99 struct vnode *,
100 dm_right_t, struct vnode *, dm_right_t,
101 char *, char *, mode_t, int, int);
102typedef void (*xfs_send_unmount_t)(struct vfs *, struct vnode *,
103 dm_right_t, mode_t, int, int);
104
105typedef struct xfs_dmops {
106 xfs_send_data_t xfs_send_data;
107 xfs_send_mmap_t xfs_send_mmap;
108 xfs_send_destroy_t xfs_send_destroy;
109 xfs_send_namesp_t xfs_send_namesp;
110 xfs_send_unmount_t xfs_send_unmount;
111} xfs_dmops_t;
112
113#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \
114 (*(mp)->m_dm_ops.xfs_send_data)(ev,vp,off,len,fl,lock)
115#define XFS_SEND_MMAP(mp, vma,fl) \
116 (*(mp)->m_dm_ops.xfs_send_mmap)(vma,fl)
117#define XFS_SEND_DESTROY(mp, vp,right) \
118 (*(mp)->m_dm_ops.xfs_send_destroy)(vp,right)
119#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
120 (*(mp)->m_dm_ops.xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
121#define XFS_SEND_PREUNMOUNT(mp, vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
122 (*(mp)->m_dm_ops.xfs_send_namesp)(DM_EVENT_PREUNMOUNT,vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl)
123#define XFS_SEND_UNMOUNT(mp, vfsp,vp,right,mode,rval,fl) \
124 (*(mp)->m_dm_ops.xfs_send_unmount)(vfsp,vp,right,mode,rval,fl)
125
126
127/*
128 * Prototypes and functions for the Quota Management subsystem.
129 */
130
131struct xfs_dquot;
132struct xfs_dqtrxops;
133struct xfs_quotainfo;
134
135typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
136typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int);
137typedef int (*xfs_qmunmount_t)(struct xfs_mount *);
138typedef void (*xfs_qmdone_t)(struct xfs_mount *);
139typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
140typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint);
141typedef void (*xfs_dqdetach_t)(struct xfs_inode *);
142typedef int (*xfs_dqpurgeall_t)(struct xfs_mount *, uint);
143typedef int (*xfs_dqvopalloc_t)(struct xfs_mount *,
144 struct xfs_inode *, uid_t, gid_t, uint,
145 struct xfs_dquot **, struct xfs_dquot **);
146typedef void (*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *,
147 struct xfs_dquot *, struct xfs_dquot *);
148typedef int (*xfs_dqvoprename_t)(struct xfs_inode **);
149typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
150 struct xfs_trans *, struct xfs_inode *,
151 struct xfs_dquot **, struct xfs_dquot *);
152typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
153 struct xfs_dquot *, struct xfs_dquot *, uint);
154
155typedef struct xfs_qmops {
156 xfs_qminit_t xfs_qminit;
157 xfs_qmdone_t xfs_qmdone;
158 xfs_qmmount_t xfs_qmmount;
159 xfs_qmunmount_t xfs_qmunmount;
160 xfs_dqrele_t xfs_dqrele;
161 xfs_dqattach_t xfs_dqattach;
162 xfs_dqdetach_t xfs_dqdetach;
163 xfs_dqpurgeall_t xfs_dqpurgeall;
164 xfs_dqvopalloc_t xfs_dqvopalloc;
165 xfs_dqvopcreate_t xfs_dqvopcreate;
166 xfs_dqvoprename_t xfs_dqvoprename;
167 xfs_dqvopchown_t xfs_dqvopchown;
168 xfs_dqvopchownresv_t xfs_dqvopchownresv;
169 struct xfs_dqtrxops *xfs_dqtrxops;
170} xfs_qmops_t;
171
172#define XFS_QM_INIT(mp, mnt, fl) \
173 (*(mp)->m_qm_ops.xfs_qminit)(mp, mnt, fl)
174#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \
175 (*(mp)->m_qm_ops.xfs_qmmount)(mp, mnt, fl, mfsi_flags)
176#define XFS_QM_UNMOUNT(mp) \
177 (*(mp)->m_qm_ops.xfs_qmunmount)(mp)
178#define XFS_QM_DONE(mp) \
179 (*(mp)->m_qm_ops.xfs_qmdone)(mp)
180#define XFS_QM_DQRELE(mp, dq) \
181 (*(mp)->m_qm_ops.xfs_dqrele)(dq)
182#define XFS_QM_DQATTACH(mp, ip, fl) \
183 (*(mp)->m_qm_ops.xfs_dqattach)(ip, fl)
184#define XFS_QM_DQDETACH(mp, ip) \
185 (*(mp)->m_qm_ops.xfs_dqdetach)(ip)
186#define XFS_QM_DQPURGEALL(mp, fl) \
187 (*(mp)->m_qm_ops.xfs_dqpurgeall)(mp, fl)
188#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, fl, dq1, dq2) \
189 (*(mp)->m_qm_ops.xfs_dqvopalloc)(mp, ip, uid, gid, fl, dq1, dq2)
190#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \
191 (*(mp)->m_qm_ops.xfs_dqvopcreate)(tp, ip, dq1, dq2)
192#define XFS_QM_DQVOPRENAME(mp, ip) \
193 (*(mp)->m_qm_ops.xfs_dqvoprename)(ip)
194#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \
195 (*(mp)->m_qm_ops.xfs_dqvopchown)(tp, ip, dqp, dq)
196#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \
197 (*(mp)->m_qm_ops.xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl)
198
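/*
 * Example (editor's note, not part of the original source): callers
 * reach the quota subsystem only through the macros above, e.g. the
 * mount path does
 *
 *	error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
 *
 * which becomes an indirect call through mp->m_qm_ops, so a kernel
 * built without the quota module can install xfs_qmcore_stub (see
 * fs/xfs/xfs_qmops.c) in the vector instead.
 */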
199
200/*
201 * Prototypes and functions for I/O core modularization.
202 */
203
204typedef int (*xfs_ioinit_t)(struct vfs *,
205 struct xfs_mount_args *, int);
206typedef int (*xfs_bmapi_t)(struct xfs_trans *, void *,
207 xfs_fileoff_t, xfs_filblks_t, int,
208 xfs_fsblock_t *, xfs_extlen_t,
209 struct xfs_bmbt_irec *, int *,
210 struct xfs_bmap_free *);
211typedef int (*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *);
212typedef int (*xfs_iomap_write_direct_t)(
213 void *, loff_t, size_t, int,
214 struct xfs_bmbt_irec *, int *, int);
215typedef int (*xfs_iomap_write_delay_t)(
216 void *, loff_t, size_t, int,
217 struct xfs_bmbt_irec *, int *);
218typedef int (*xfs_iomap_write_allocate_t)(
219 void *, struct xfs_bmbt_irec *, int *);
220typedef int (*xfs_iomap_write_unwritten_t)(
221 void *, loff_t, size_t);
222typedef uint (*xfs_lck_map_shared_t)(void *);
223typedef void (*xfs_lock_t)(void *, uint);
224typedef void (*xfs_lock_demote_t)(void *, uint);
225typedef int (*xfs_lock_nowait_t)(void *, uint);
226typedef void (*xfs_unlk_t)(void *, unsigned int);
227typedef xfs_fsize_t (*xfs_size_t)(void *);
228typedef xfs_fsize_t (*xfs_iodone_t)(struct vfs *);
229
230typedef struct xfs_ioops {
231 xfs_ioinit_t xfs_ioinit;
232 xfs_bmapi_t xfs_bmapi_func;
233 xfs_bmap_eof_t xfs_bmap_eof_func;
234 xfs_iomap_write_direct_t xfs_iomap_write_direct;
235 xfs_iomap_write_delay_t xfs_iomap_write_delay;
236 xfs_iomap_write_allocate_t xfs_iomap_write_allocate;
237 xfs_iomap_write_unwritten_t xfs_iomap_write_unwritten;
238 xfs_lock_t xfs_ilock;
239 xfs_lck_map_shared_t xfs_lck_map_shared;
240 xfs_lock_demote_t xfs_ilock_demote;
241 xfs_lock_nowait_t xfs_ilock_nowait;
242 xfs_unlk_t xfs_unlock;
243 xfs_size_t xfs_size_func;
244 xfs_iodone_t xfs_iodone;
245} xfs_ioops_t;
246
247#define XFS_IOINIT(vfsp, args, flags) \
248 (*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags)
249#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist) \
250 (*(mp)->m_io_ops.xfs_bmapi_func) \
251 (trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist)
252#define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \
253 (*(mp)->m_io_ops.xfs_bmap_eof_func) \
254 ((io)->io_obj, endoff, whichfork, eof)
255#define XFS_IOMAP_WRITE_DIRECT(mp, io, offset, count, flags, mval, nmap, found)\
256 (*(mp)->m_io_ops.xfs_iomap_write_direct) \
257 ((io)->io_obj, offset, count, flags, mval, nmap, found)
258#define XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, flags, mval, nmap) \
259 (*(mp)->m_io_ops.xfs_iomap_write_delay) \
260 ((io)->io_obj, offset, count, flags, mval, nmap)
261#define XFS_IOMAP_WRITE_ALLOCATE(mp, io, mval, nmap) \
262 (*(mp)->m_io_ops.xfs_iomap_write_allocate) \
263 ((io)->io_obj, mval, nmap)
264#define XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count) \
265 (*(mp)->m_io_ops.xfs_iomap_write_unwritten) \
266 ((io)->io_obj, offset, count)
267#define XFS_LCK_MAP_SHARED(mp, io) \
268 (*(mp)->m_io_ops.xfs_lck_map_shared)((io)->io_obj)
269#define XFS_ILOCK(mp, io, mode) \
270 (*(mp)->m_io_ops.xfs_ilock)((io)->io_obj, mode)
271#define XFS_ILOCK_NOWAIT(mp, io, mode) \
272 (*(mp)->m_io_ops.xfs_ilock_nowait)((io)->io_obj, mode)
273#define XFS_IUNLOCK(mp, io, mode) \
274 (*(mp)->m_io_ops.xfs_unlock)((io)->io_obj, mode)
275#define XFS_ILOCK_DEMOTE(mp, io, mode) \
276 (*(mp)->m_io_ops.xfs_ilock_demote)((io)->io_obj, mode)
277#define XFS_SIZE(mp, io) \
278 (*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
279#define XFS_IODONE(vfsp) \
280 (*(mp)->m_io_ops.xfs_iodone)(vfsp)
281
282
283typedef struct xfs_mount {
284 bhv_desc_t m_bhv; /* vfs xfs behavior */
285 xfs_tid_t m_tid; /* next unused tid for fs */
286 AIL_LOCK_T m_ail_lock; /* fs AIL mutex */
287 xfs_ail_entry_t m_ail; /* fs active log item list */
288 uint m_ail_gen; /* fs AIL generation count */
289 xfs_sb_t m_sb; /* copy of fs superblock */
290 lock_t m_sb_lock; /* sb counter mutex */
291 struct xfs_buf *m_sb_bp; /* buffer for superblock */
292 char *m_fsname; /* filesystem name */
293 int m_fsname_len; /* strlen of fs name */
294 int m_bsize; /* fs logical block size */
295 xfs_agnumber_t m_agfrotor; /* last ag where space found */
296 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
297 lock_t m_agirotor_lock;/* .. and lock protecting it */
298 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
299 uint m_ihsize; /* size of next field */
300 struct xfs_ihash *m_ihash; /* fs private inode hash table*/
301 struct xfs_inode *m_inodes; /* active inode list */
302 struct list_head m_del_inodes; /* inodes to reclaim */
303 mutex_t m_ilock; /* inode list mutex */
304 uint m_ireclaims; /* count of calls to reclaim*/
305 uint m_readio_log; /* min read size log bytes */
306 uint m_readio_blocks; /* min read size blocks */
307 uint m_writeio_log; /* min write size log bytes */
308 uint m_writeio_blocks; /* min write size blocks */
309 struct log *m_log; /* log specific stuff */
310 int m_logbufs; /* number of log buffers */
311 int m_logbsize; /* size of each log buffer */
312 uint m_rsumlevels; /* rt summary levels */
313 uint m_rsumsize; /* size of rt summary, bytes */
314 struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
315 struct xfs_inode *m_rsumip; /* pointer to summary inode */
316 struct xfs_inode *m_rootip; /* pointer to root directory */
317 struct xfs_quotainfo *m_quotainfo; /* disk quota information */
318 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
319 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
320 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
321#define m_dev m_ddev_targp->pbr_dev
322 __uint8_t m_dircook_elog; /* log d-cookie entry bits */
323 __uint8_t m_blkbit_log; /* blocklog + NBBY */
324 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
325 __uint8_t m_agno_log; /* log #ag's */
326 __uint8_t m_agino_log; /* #bits for agino in inum */
327 __uint8_t m_nreadaheads; /* #readahead buffers */
328 __uint16_t m_inode_cluster_size;/* min inode buf size */
329 uint m_blockmask; /* sb_blocksize-1 */
330 uint m_blockwsize; /* sb_blocksize in words */
331 uint m_blockwmask; /* blockwsize-1 */
332 uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */
333 uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */
334 uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */
335 uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */
336 uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */
337 uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */
338 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
339 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
340 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
341 struct xfs_perag *m_perag; /* per-ag accounting info */
342 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */
343 sema_t m_growlock; /* growfs mutex */
344 int m_fixedfsid[2]; /* unchanged for life of FS */
345 uint m_dmevmask; /* DMI events for this FS */
346 uint m_flags; /* global mount flags */
347 uint m_attroffset; /* inode attribute offset */
348 uint m_dir_node_ents; /* #entries in a dir danode */
349 uint m_attr_node_ents; /* #entries in attr danode */
350 int m_ialloc_inos; /* inodes in inode allocation */
351 int m_ialloc_blks; /* blocks in inode allocation */
352 int m_litino; /* size of inode union area */
353 int m_inoalign_mask;/* mask sb_inoalignmt if used */
354 uint m_qflags; /* quota status flags */
355 xfs_trans_reservations_t m_reservations;/* precomputed res values */
356 __uint64_t m_maxicount; /* maximum inode count */
357 __uint64_t m_maxioffset; /* maximum inode offset */
358 __uint64_t m_resblks; /* total reserved blocks */
359 __uint64_t m_resblks_avail;/* available reserved blocks */
360#if XFS_BIG_INUMS
361 xfs_ino_t m_inoadd; /* add value for ino64_offset */
362#endif
363 int m_dalign; /* stripe unit */
364 int m_swidth; /* stripe width */
365 int m_sinoalign; /* stripe unit inode alignmnt */
366 int m_attr_magicpct;/* 37% of the blocksize */
367 int m_dir_magicpct; /* 37% of the dir blocksize */
368 __uint8_t m_mk_sharedro; /* mark shared ro on unmount */
369 __uint8_t m_inode_quiesce;/* call quiesce on new inodes.
370 field governed by m_ilock */
371 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
372 __uint8_t m_dirversion; /* 1 or 2 */
373 xfs_dirops_t m_dirops; /* table of dir funcs */
374 int m_dirblksize; /* directory block sz--bytes */
375 int m_dirblkfsbs; /* directory block sz--fsbs */
376 xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */
377 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
378 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
379 uint m_chsize; /* size of next field */
380 struct xfs_chash *m_chash; /* fs private inode per-cluster
381 * hash table */
382 struct xfs_dmops m_dm_ops; /* vector of DMI ops */
383 struct xfs_qmops m_qm_ops; /* vector of XQM ops */
384 struct xfs_ioops m_io_ops; /* vector of I/O ops */
385 atomic_t m_active_trans; /* number trans frozen */
386} xfs_mount_t;
387
388/*
389 * Flags for m_flags.
390 */
391#define XFS_MOUNT_WSYNC 0x00000001 /* for nfs - all metadata ops
392 must be synchronous except
393 for space allocations */
394#define XFS_MOUNT_INO64 0x00000002
395 /* 0x00000004 -- currently unused */
396 /* 0x00000008 -- currently unused */
397#define XFS_MOUNT_FS_SHUTDOWN 0x00000010 /* atomic stop of all filesystem
398 operations, typically for
399 disk errors in metadata */
400#define XFS_MOUNT_NOATIME 0x00000020 /* don't modify inode access
401 times on reads */
402#define XFS_MOUNT_RETERR 0x00000040 /* return alignment errors to
403 user */
404#define XFS_MOUNT_NOALIGN 0x00000080 /* turn off stripe alignment
405 allocations */
406 /* 0x00000100 -- currently unused */
407 /* 0x00000200 -- currently unused */
408#define XFS_MOUNT_NORECOVERY 0x00000400 /* no recovery - dirty fs */
409#define XFS_MOUNT_SHARED 0x00000800 /* shared mount */
410#define XFS_MOUNT_DFLT_IOSIZE 0x00001000 /* set default i/o size */
411#define XFS_MOUNT_OSYNCISOSYNC 0x00002000 /* o_sync is REALLY o_sync */
412 /* osyncisdsync is now default*/
413#define XFS_MOUNT_32BITINODES 0x00004000 /* do not create inodes above
414 * 32 bits in size */
415#define XFS_MOUNT_32BITINOOPT 0x00008000 /* saved mount option state */
416#define XFS_MOUNT_NOUUID 0x00010000 /* ignore uuid during mount */
417#define XFS_MOUNT_NOLOGFLUSH 0x00020000
418#define XFS_MOUNT_IDELETE 0x00040000 /* delete empty inode clusters*/
419#define XFS_MOUNT_SWALLOC 0x00080000 /* turn on stripe width
420 * allocation */
421#define XFS_MOUNT_IHASHSIZE 0x00100000 /* inode hash table size */
422#define XFS_MOUNT_DIRSYNC 0x00200000 /* synchronous directory ops */
423
424/*
425 * Default minimum read and write sizes.
426 */
427#define XFS_READIO_LOG_LARGE 16
428#define XFS_WRITEIO_LOG_LARGE 16
429
430/*
431 * Max and min values for UIO and mount-option defined I/O sizes;
432 * min value can't be less than a page. Currently unused.
433 */
434#define XFS_MAX_IO_LOG 16 /* 64K */
435#define XFS_MIN_IO_LOG PAGE_SHIFT
436
437/*
438 * Synchronous read and write sizes. This should be
439 * better for NFSv2 wsync filesystems.
440 */
441#define XFS_WSYNC_READIO_LOG 15 /* 32K */
442#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */
443
444#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset)
445
446#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
447#define xfs_force_shutdown(m,f) \
448 VFS_FORCE_SHUTDOWN((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
449
450/*
451 * Flags sent to xfs_force_shutdown.
452 */
453#define XFS_METADATA_IO_ERROR 0x1
454#define XFS_LOG_IO_ERROR 0x2
455#define XFS_FORCE_UMOUNT 0x4
456#define XFS_CORRUPT_INCORE 0x8 /* Corrupt in-memory data structures */
457#define XFS_SHUTDOWN_REMOTE_REQ 0x10 /* Shutdown came from remote cell */
458
459/*
460 * xflags for xfs_syncsub
461 */
462#define XFS_XSYNC_RELOC 0x01
463
464/*
465 * Flags for xfs_mountfs
466 */
467#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */
468#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */
469#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */
470 /* log recovery */
471#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */
472
473/*
474 * Macros for getting from mount to vfs and back.
475 */
476#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MTOVFS)
477struct vfs *xfs_mtovfs(xfs_mount_t *mp);
478#define XFS_MTOVFS(mp) xfs_mtovfs(mp)
479#else
480#define XFS_MTOVFS(mp) (bhvtovfs(&(mp)->m_bhv))
481#endif
482#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOM)
483xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp);
484#define XFS_BHVTOM(bdp) xfs_bhvtom(bdp)
485#else
486#define XFS_BHVTOM(bdp) ((xfs_mount_t *)BHV_PDATA(bdp))
487#endif
488#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_VFSTOM)
489xfs_mount_t *xfs_vfstom(vfs_t *vfs);
490#define XFS_VFSTOM(vfs) xfs_vfstom(vfs)
491#else
492#define XFS_VFSTOM(vfs) \
493 (XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops)))
494#endif
495
496
497/*
498 * Moved here from xfs_ag.h to avoid reordering header files
499 */
500
501#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGNO)
502xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d);
503#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
504#else
505
506static inline xfs_agnumber_t XFS_DADDR_TO_AGNO(xfs_mount_t *mp, xfs_daddr_t d)
507{
508 d = XFS_BB_TO_FSBT(mp, d);
509 do_div(d, mp->m_sb.sb_agblocks);
510 return (xfs_agnumber_t) d;
511}
512
513#endif
514#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGBNO)
515xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d);
516#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d)
517#else
518
519static inline xfs_agblock_t XFS_DADDR_TO_AGBNO(xfs_mount_t *mp, xfs_daddr_t d)
520{
521 d = XFS_BB_TO_FSBT(mp, d);
522 return (xfs_agblock_t) do_div(d, mp->m_sb.sb_agblocks);
523}
524
525#endif
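/*
 * Worked example (editor's sketch, not in the original source): with
 * 4 KB filesystem blocks a basic block is 512 bytes, so
 * XFS_BB_TO_FSBT() shifts right by 3. For sb_agblocks = 16384, daddr
 * d = 262144 maps to fsbno 32768, giving XFS_DADDR_TO_AGNO() = 2 and
 * XFS_DADDR_TO_AGBNO() = 0.
 */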
526
527/*
528 * This structure is for use by the xfs_mod_incore_sb_batch() routine.
529 */
530typedef struct xfs_mod_sb {
531 xfs_sb_field_t msb_field; /* Field to modify, see below */
532 int msb_delta; /* Change to make to specified field */
533} xfs_mod_sb_t;
534
535#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock), PINOD)
536#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
537#define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock)
538#define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s))
539
540extern xfs_mount_t *xfs_mount_init(void);
541extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
542extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
543extern int xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
544
545extern int xfs_unmountfs(xfs_mount_t *, struct cred *);
546extern void xfs_unmountfs_wait(xfs_mount_t *);
547extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *);
548extern int xfs_unmountfs_writesb(xfs_mount_t *);
549extern int xfs_unmount_flush(xfs_mount_t *, int);
550extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int);
551extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
552 uint, int);
553extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
554extern int xfs_readsb(xfs_mount_t *mp);
555extern void xfs_freesb(xfs_mount_t *);
556extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
557extern int xfs_syncsub(xfs_mount_t *, int, int, int *);
558extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
559extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t);
560
561extern struct vfsops xfs_vfsops;
562extern struct vnodeops xfs_vnodeops;
563
564extern struct xfs_dmops xfs_dmcore_stub;
565extern struct xfs_qmops xfs_qmcore_stub;
566extern struct xfs_ioops xfs_iocore_xfs;
567
568extern int xfs_init(void);
569extern void xfs_cleanup(void);
570
571#endif /* __KERNEL__ */
572
573#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
new file mode 100644
index 000000000000..4f40c92863d5
--- /dev/null
+++ b/fs/xfs/xfs_qmops.c
@@ -0,0 +1,71 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#include "xfs.h"
33
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45
46
47STATIC struct xfs_dquot *
48xfs_dqvopchown_default(
49 struct xfs_trans *tp,
50 struct xfs_inode *ip,
51 struct xfs_dquot **dqp,
52 struct xfs_dquot *dq)
53{
54 return NULL;
55}
56
57xfs_qmops_t xfs_qmcore_stub = {
58 .xfs_qminit = (xfs_qminit_t) fs_noerr,
59 .xfs_qmdone = (xfs_qmdone_t) fs_noerr,
60 .xfs_qmmount = (xfs_qmmount_t) fs_noerr,
61 .xfs_qmunmount = (xfs_qmunmount_t) fs_noerr,
62 .xfs_dqrele = (xfs_dqrele_t) fs_noerr,
63 .xfs_dqattach = (xfs_dqattach_t) fs_noerr,
64 .xfs_dqdetach = (xfs_dqdetach_t) fs_noerr,
65 .xfs_dqpurgeall = (xfs_dqpurgeall_t) fs_noerr,
66 .xfs_dqvopalloc = (xfs_dqvopalloc_t) fs_noerr,
67 .xfs_dqvopcreate = (xfs_dqvopcreate_t) fs_noerr,
68 .xfs_dqvoprename = (xfs_dqvoprename_t) fs_noerr,
69 .xfs_dqvopchown = xfs_dqvopchown_default,
70 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr,
71};
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
new file mode 100644
index 000000000000..703ec4efcb41
--- /dev/null
+++ b/fs/xfs/xfs_quota.h
@@ -0,0 +1,356 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QUOTA_H__
33#define __XFS_QUOTA_H__
34
35/*
36 * The ondisk form of a dquot structure.
37 */
38#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
39#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
40
41/*
42 * uid_t and gid_t are hard-coded to 32 bits in the inode.
43 * Hence, an 'id' in a dquot is 32 bits..
44 */
45typedef __int32_t xfs_dqid_t;
46
47/*
48 * Even though users may not have quota limits occupying all 64 bits,
49 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
50 * and quota-limits. This is a waste in the common case, but hey ...
51 */
52typedef __uint64_t xfs_qcnt_t;
53typedef __uint16_t xfs_qwarncnt_t;
54
55/*
56 * This is the main portion of the on-disk representation of quota
57 * information for a user. This is the q_core of the xfs_dquot_t that
58 * is kept in kernel memory. We pad this with some more expansion room
59 * to construct the on disk structure.
60 */
61typedef struct xfs_disk_dquot {
62/*16*/ u_int16_t d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
63/*8 */ u_int8_t d_version; /* dquot version */
64/*8 */ u_int8_t d_flags; /* XFS_DQ_USER/PROJ/GROUP */
65/*32*/ xfs_dqid_t d_id; /* user,project,group id */
66/*64*/ xfs_qcnt_t d_blk_hardlimit;/* absolute limit on disk blks */
67/*64*/ xfs_qcnt_t d_blk_softlimit;/* preferred limit on disk blks */
68/*64*/ xfs_qcnt_t d_ino_hardlimit;/* maximum # allocated inodes */
69/*64*/ xfs_qcnt_t d_ino_softlimit;/* preferred inode limit */
70/*64*/ xfs_qcnt_t d_bcount; /* disk blocks owned by the user */
71/*64*/ xfs_qcnt_t d_icount; /* inodes owned by the user */
72/*32*/	__int32_t	d_itimer;	/* zero if within inode limits; if not,
73					   this is when we refuse service */
74/*32*/ __int32_t d_btimer; /* similar to above; for disk blocks */
75/*16*/ xfs_qwarncnt_t d_iwarns; /* warnings issued wrt num inodes */
76/*16*/ xfs_qwarncnt_t d_bwarns; /* warnings issued wrt disk blocks */
77/*32*/ __int32_t d_pad0; /* 64 bit align */
78/*64*/ xfs_qcnt_t d_rtb_hardlimit;/* absolute limit on realtime blks */
79/*64*/ xfs_qcnt_t d_rtb_softlimit;/* preferred limit on RT disk blks */
80/*64*/ xfs_qcnt_t d_rtbcount; /* realtime blocks owned */
81/*32*/ __int32_t d_rtbtimer; /* similar to above; for RT disk blocks */
82/*16*/ xfs_qwarncnt_t d_rtbwarns; /* warnings issued wrt RT disk blocks */
83/*16*/ __uint16_t d_pad;
84} xfs_disk_dquot_t;
85
86/*
87 * This is what goes on disk. This is separated from the xfs_disk_dquot because
88 * carrying the unnecessary padding would be a waste of memory.
89 */
90typedef struct xfs_dqblk {
91 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
92 char dd_fill[32]; /* filling for posterity */
93} xfs_dqblk_t;
94
95/*
96 * flags for q_flags field in the dquot.
97 */
98#define XFS_DQ_USER 0x0001 /* a user quota */
99/* #define XFS_DQ_PROJ 0x0002 -- project quota (IRIX) */
100#define XFS_DQ_GROUP 0x0004 /* a group quota */
101#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */
102#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */
103#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */
104#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */
105#define XFS_DQ_MARKER 0x0080 /* sentinel */
106
107/*
108 * In the worst case, when both user and group quotas are on,
109 * we can have a max of three dquots changing in a single transaction.
110 */
111#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3)
112
113
114/*
115 * These are the structures used to lay out dquots and quotaoff
116 * records on the log. Quite similar to those of inodes.
117 */
118
119/*
120 * log format struct for dquots.
121 * The first two fields must be the type and size fitting into
122 * 32 bits : log_recovery code assumes that.
123 */
124typedef struct xfs_dq_logformat {
125 __uint16_t qlf_type; /* dquot log item type */
126 __uint16_t qlf_size; /* size of this item */
127 xfs_dqid_t qlf_id; /* usr/grp id number : 32 bits */
128 __int64_t qlf_blkno; /* blkno of dquot buffer */
129 __int32_t qlf_len; /* len of dquot buffer */
130 __uint32_t qlf_boffset; /* off of dquot in buffer */
131} xfs_dq_logformat_t;
132
133/*
134 * log format struct for QUOTAOFF records.
135 * The first two fields must be the type and size fitting into
136 * 32 bits : log_recovery code assumes that.
137 * We write two LI_QUOTAOFF logitems per quotaoff; the last one keeps a pointer
138 * to the first and ensures that the first logitem is taken out of the AIL
139 * only when the last one is securely committed.
140 */
141typedef struct xfs_qoff_logformat {
142 unsigned short qf_type; /* quotaoff log item type */
143 unsigned short qf_size; /* size of this item */
144 unsigned int qf_flags; /* USR and/or GRP */
145 char qf_pad[12]; /* padding for future */
146} xfs_qoff_logformat_t;
147
148
149/*
150 * Disk quota status in m_qflags, and also sb_qflags. 16 bits.
151 */
152#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
153#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
154#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
155#define XFS_PQUOTA_ACCT 0x0008 /* (IRIX) project quota accounting ON */
156#define XFS_GQUOTA_ENFD 0x0010 /* group quota limits enforced */
157#define XFS_GQUOTA_CHKD 0x0020 /* quotacheck run on grp quotas */
158#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
159
160/*
161 * Incore only flags for quotaoff - these bits get cleared when quota(s)
162 * are in the process of getting turned off. These flags are in m_qflags but
163 * never in sb_qflags.
164 */
165#define XFS_UQUOTA_ACTIVE	0x0080  /* uquotas are active (not being turned off) */
166#define XFS_GQUOTA_ACTIVE	0x0100  /* gquotas are active (not being turned off) */
167
168/*
169 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
170 * quota will not be switched off as long as that inode lock is held.
171 */
172#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
173 XFS_GQUOTA_ACTIVE))
174#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
175#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
176
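/*
 * A minimal usage sketch (assuming the caller already holds an inode
 * lock, which per the comment above pins the quota-on state):
 *
 *	if (XFS_IS_UQUOTA_ON(mp)) {
 *		... user quota is on and stays on while the lock is held ...
 *	}
 */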
177/*
178 * Flags to tell various functions what to do. Not all of these are meaningful
179 * to a single function. None of these XFS_QMOPT_* flags are meant to have
180 * persistent values (i.e., their values can and will change between versions).
181 */
182#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */
183#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
184#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
185#define XFS_QMOPT_GQUOTA 0x0000008 /* group dquot requested */
186#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
187#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
188#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
189#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
190#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
191#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
192#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if necessary */
193#define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */
194#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot, if damaged. */
195
196/*
197 * flags to xfs_trans_mod_dquot to indicate which field needs to be
198 * modified.
199 */
200#define XFS_QMOPT_RES_REGBLKS 0x0010000
201#define XFS_QMOPT_RES_RTBLKS 0x0020000
202#define XFS_QMOPT_BCOUNT 0x0040000
203#define XFS_QMOPT_ICOUNT 0x0080000
204#define XFS_QMOPT_RTBCOUNT 0x0100000
205#define XFS_QMOPT_DELBCOUNT 0x0200000
206#define XFS_QMOPT_DELRTBCOUNT 0x0400000
207#define XFS_QMOPT_RES_INOS 0x0800000
208
209/*
210 * flags for dqflush and dqflush_all.
211 */
212#define XFS_QMOPT_SYNC 0x1000000
213#define XFS_QMOPT_ASYNC 0x2000000
214#define XFS_QMOPT_DELWRI 0x4000000
215
216/*
217 * flags for dqalloc.
218 */
219#define XFS_QMOPT_INHERIT 0x8000000
220
221/*
222 * flags to xfs_trans_mod_dquot.
223 */
224#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
225#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
226#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
227#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
228#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
229#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
230#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
231#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
232
233
234#define XFS_QMOPT_QUOTALL (XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA)
235#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
236
237#ifdef __KERNEL__
238/*
239 * This check is typically done without holding the inode lock;
240 * that may seem racy, but it is harmless in the context in which it is
241 * used. The inode cannot go inactive as long as a reference is kept,
242 * and therefore if dquot(s) were attached, they'll stay consistent.
243 * If, for example, the ownership of the inode changes while
244 * we don't have the inode locked, the appropriate dquot(s) will be
245 * attached atomically.
246 */
247#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
248 (ip)->i_udquot == NULL) || \
249 (XFS_IS_GQUOTA_ON(mp) && \
250 (ip)->i_gdquot == NULL))
251
252#define XFS_QM_NEED_QUOTACHECK(mp) ((XFS_IS_UQUOTA_ON(mp) && \
253 (mp->m_sb.sb_qflags & \
254 XFS_UQUOTA_CHKD) == 0) || \
255 (XFS_IS_GQUOTA_ON(mp) && \
256 (mp->m_sb.sb_qflags & \
257 XFS_GQUOTA_CHKD) == 0))
258
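/*
 * A sketch of the intended attach-on-demand pattern (xfs_qm_dqattach
 * being the quota manager's attach routine; flags elided):
 *
 *	if (XFS_IS_QUOTA_ON(mp) && XFS_NOT_DQATTACHED(mp, ip))
 *		error = xfs_qm_dqattach(ip, 0);
 */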
259#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
260 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
261 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
262#define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \
263 XFS_GQUOTA_ACTIVE)
264
265
266/*
267 * The structure kept inside the xfs_trans_t keeps track of dquot changes
268 * within a transaction, so they can be applied later.
269 */
270typedef struct xfs_dqtrx {
271 struct xfs_dquot *qt_dquot; /* the dquot this refers to */
272 ulong qt_blk_res; /* blks reserved on a dquot */
273 ulong qt_blk_res_used; /* blks used from the reservation */
274	ulong		qt_ino_res;	/* inodes reserved on a dquot */
275 ulong qt_ino_res_used; /* inodes used from the reservation */
276 long qt_bcount_delta; /* dquot blk count changes */
277 long qt_delbcnt_delta; /* delayed dquot blk count changes */
278 long qt_icount_delta; /* dquot inode count changes */
279 ulong qt_rtblk_res; /* # blks reserved on a dquot */
280 ulong qt_rtblk_res_used;/* # blks used from reservation */
281 long qt_rtbcount_delta;/* dquot realtime blk changes */
282 long qt_delrtb_delta; /* delayed RT blk count changes */
283} xfs_dqtrx_t;
284
285/*
286 * Dquot transaction functions, used if quota is enabled.
287 */
288typedef void (*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *);
289typedef void (*qo_mod_dquot_byino_t)(struct xfs_trans *,
290 struct xfs_inode *, uint, long);
291typedef void (*qo_free_dqinfo_t)(struct xfs_trans *);
292typedef void (*qo_apply_dquot_deltas_t)(struct xfs_trans *);
293typedef void (*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *);
294typedef int (*qo_reserve_quota_nblks_t)(
295 struct xfs_trans *, struct xfs_mount *,
296 struct xfs_inode *, long, long, uint);
297typedef int (*qo_reserve_quota_bydquots_t)(
298 struct xfs_trans *, struct xfs_mount *,
299 struct xfs_dquot *, struct xfs_dquot *,
300 long, long, uint);
301typedef struct xfs_dqtrxops {
302 qo_dup_dqinfo_t qo_dup_dqinfo;
303 qo_free_dqinfo_t qo_free_dqinfo;
304 qo_mod_dquot_byino_t qo_mod_dquot_byino;
305 qo_apply_dquot_deltas_t qo_apply_dquot_deltas;
306 qo_reserve_quota_nblks_t qo_reserve_quota_nblks;
307 qo_reserve_quota_bydquots_t qo_reserve_quota_bydquots;
308 qo_unreserve_and_mod_dquots_t qo_unreserve_and_mod_dquots;
309} xfs_dqtrxops_t;
310
311#define XFS_DQTRXOP(mp, tp, op, args...) \
312 ((mp)->m_qm_ops.xfs_dqtrxops ? \
313 ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : 0)
314
315#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \
316 ((mp)->m_qm_ops.xfs_dqtrxops ? \
317 ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : (void)0)
318
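/*
 * E.g. XFS_TRANS_DUP_DQINFO(mp, otp, ntp) below expands to a call of
 * qo_dup_dqinfo(otp, ntp) when a quota manager has registered its
 * xfs_dqtrxops, and to a no-op (or 0, for the int variant) when
 * xfs_dqtrxops is NULL.
 */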
319#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \
320 XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp)
321#define XFS_TRANS_FREE_DQINFO(mp, tp) \
322 XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo)
323#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \
324 XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta)
325#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \
326 XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas)
327#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \
328 XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl)
329#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \
330 XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl)
331#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \
332 XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots)
333
334#define XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, nblks) \
335 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \
336 XFS_QMOPT_RES_REGBLKS)
337#define XFS_TRANS_RESERVE_BLKQUOTA_FORCE(mp, tp, ip, nblks) \
338 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \
339 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES)
340#define XFS_TRANS_UNRESERVE_BLKQUOTA(mp, tp, ip, nblks) \
341 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), 0, \
342 XFS_QMOPT_RES_REGBLKS)
343#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
344 XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \
345 f | XFS_QMOPT_RES_REGBLKS)
346#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
347 XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \
348 f | XFS_QMOPT_RES_REGBLKS)
349
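/*
 * A sketch of the typical per-transaction flow these wrappers support
 * (the surrounding steps are illustrative only):
 *
 *	error = XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, nblks);
 *	... allocate nblks blocks to ip ...
 *	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, nblks);
 *	... xfs_trans_commit() applies the accumulated dquot deltas ...
 */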
350extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
351
352extern struct bhv_vfsops xfs_qmops;
353
354#endif /* __KERNEL__ */
355
356#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
new file mode 100644
index 000000000000..cd8ddfd35d69
--- /dev/null
+++ b/fs/xfs/xfs_refcache.h
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_REFCACHE_H__
33#define __XFS_REFCACHE_H__
34
35#ifdef HAVE_REFCACHE
36/*
37 * Maximum size (in inodes) for the NFS reference cache
38 */
39#define XFS_REFCACHE_SIZE_MAX 512
40
41struct xfs_inode;
42struct xfs_mount;
43
44extern void xfs_refcache_insert(struct xfs_inode *);
45extern void xfs_refcache_purge_ip(struct xfs_inode *);
46extern void xfs_refcache_purge_mp(struct xfs_mount *);
47extern void xfs_refcache_purge_some(struct xfs_mount *);
48extern void xfs_refcache_resize(int);
49extern void xfs_refcache_destroy(void);
50
51extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
52
53#else
54
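/*
 * With HAVE_REFCACHE unset, stub the interface out so that callers
 * (e.g. xfs_refcache_purge_ip() in xfs_rename()) need no #ifdefs.
 */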
55#define xfs_refcache_insert(ip) do { } while (0)
56#define xfs_refcache_purge_ip(ip) do { } while (0)
57#define xfs_refcache_purge_mp(mp) do { } while (0)
58#define xfs_refcache_purge_some(mp) do { } while (0)
59#define xfs_refcache_resize(size) do { } while (0)
60#define xfs_refcache_destroy() do { } while (0)
61
62#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
63
64#endif
65
66#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
new file mode 100644
index 000000000000..cb13f9a1d45b
--- /dev/null
+++ b/fs/xfs/xfs_rename.c
@@ -0,0 +1,673 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_bmap_btree.h"
45#include "xfs_attr_sf.h"
46#include "xfs_dir_sf.h"
47#include "xfs_dir2_sf.h"
48#include "xfs_dinode.h"
49#include "xfs_inode_item.h"
50#include "xfs_inode.h"
51#include "xfs_bmap.h"
52#include "xfs_error.h"
53#include "xfs_quota.h"
54#include "xfs_refcache.h"
55#include "xfs_utils.h"
56#include "xfs_trans_space.h"
57#include "xfs_da_btree.h"
58#include "xfs_dir_leaf.h"
59
60
61/*
62 * Given an array of up to 4 inode pointers, unlock the pointed-to inodes.
63 * If there are fewer than 4 entries in the array, the empty entries will
64 * be at the end and will have NULL pointers in them.
65 */
66STATIC void
67xfs_rename_unlock4(
68 xfs_inode_t **i_tab,
69 uint lock_mode)
70{
71 int i;
72
73 xfs_iunlock(i_tab[0], lock_mode);
74 for (i = 1; i < 4; i++) {
75 if (i_tab[i] == NULL) {
76 break;
77 }
78 /*
79		 * Watch out for duplicate entries in the table (it is sorted, so duplicates are adjacent).
80 */
81 if (i_tab[i] != i_tab[i-1]) {
82 xfs_iunlock(i_tab[i], lock_mode);
83 }
84 }
85}
86
87#ifdef DEBUG
88int xfs_rename_skip, xfs_rename_nskip;
89#endif
90
91/*
92 * The following routine will acquire the locks required for a rename
93 * operation. The code understands the semantics of renames and will
94 * validate that name1 exists under dp1 and look up name2, which may
95 * or may not exist under dp2.
96 *
97 * We are renaming dp1/name1 to dp2/name2.
98 *
99 * Return ENOENT if name1 does not exist under dp1, other lookup errors, or 0 for success.
100 */
101STATIC int
102xfs_lock_for_rename(
103 xfs_inode_t *dp1, /* old (source) directory inode */
104 xfs_inode_t *dp2, /* new (target) directory inode */
105 vname_t *vname1,/* old entry name */
106 vname_t *vname2,/* new entry name */
107 xfs_inode_t **ipp1, /* inode of old entry */
108 xfs_inode_t **ipp2, /* inode of new entry, if it
109 already exists, NULL otherwise. */
110	xfs_inode_t	**i_tab,/* array of inodes returned, sorted */
111 int *num_inodes) /* number of inodes in array */
112{
113 xfs_inode_t *ip1, *ip2, *temp;
114 xfs_ino_t inum1, inum2;
115 int error;
116 int i, j;
117 uint lock_mode;
118 int diff_dirs = (dp1 != dp2);
119
120 ip2 = NULL;
121
122 /*
123 * First, find out the current inums of the entries so that we
124 * can determine the initial locking order. We'll have to
125 * sanity check stuff after all the locks have been acquired
126 * to see if we still have the right inodes, directories, etc.
127 */
128 lock_mode = xfs_ilock_map_shared(dp1);
129 error = xfs_get_dir_entry(vname1, &ip1);
130 if (error) {
131 xfs_iunlock_map_shared(dp1, lock_mode);
132 return error;
133 }
134
135 inum1 = ip1->i_ino;
136
137 ASSERT(ip1);
138 ITRACE(ip1);
139
140 /*
141 * Unlock dp1 and lock dp2 if they are different.
142 */
143
144 if (diff_dirs) {
145 xfs_iunlock_map_shared(dp1, lock_mode);
146 lock_mode = xfs_ilock_map_shared(dp2);
147 }
148
149 error = xfs_dir_lookup_int(XFS_ITOBHV(dp2), lock_mode,
150 vname2, &inum2, &ip2);
151 if (error == ENOENT) { /* target does not need to exist. */
152 inum2 = 0;
153 } else if (error) {
154 /*
155 * If dp2 and dp1 are the same, the next line unlocks dp1.
156 * Got it?
157 */
158 xfs_iunlock_map_shared(dp2, lock_mode);
159 IRELE (ip1);
160 return error;
161 } else {
162 ITRACE(ip2);
163 }
164
165 /*
166 * i_tab contains a list of pointers to inodes. We initialize
167 * the table here & we'll sort it. We will then use it to
168 * order the acquisition of the inode locks.
169 *
170	 * Note that the table may contain duplicates; e.g., dp1 == dp2.
171 */
172 i_tab[0] = dp1;
173 i_tab[1] = dp2;
174 i_tab[2] = ip1;
175 if (inum2 == 0) {
176 *num_inodes = 3;
177 i_tab[3] = NULL;
178 } else {
179 *num_inodes = 4;
180 i_tab[3] = ip2;
181 }
182
183 /*
184 * Sort the elements via bubble sort. (Remember, there are at
185 * most 4 elements to sort, so this is adequate.)
186 */
187 for (i=0; i < *num_inodes; i++) {
188 for (j=1; j < *num_inodes; j++) {
189 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
190 temp = i_tab[j];
191 i_tab[j] = i_tab[j-1];
192 i_tab[j-1] = temp;
193 }
194 }
195 }
196
197 /*
198 * We have dp2 locked. If it isn't first, unlock it.
199 * If it is first, tell xfs_lock_inodes so it can skip it
200	 * when locking. If dp1 == dp2, xfs_lock_inodes will skip both
201	 * since they are equal. xfs_lock_inodes needs all these inodes
202	 * so that it can unlock and retry if there is a potential
203	 * deadlock with the log.
204 */
205
206 if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) {
207#ifdef DEBUG
208 xfs_rename_skip++;
209#endif
210 xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED);
211 } else {
212#ifdef DEBUG
213 xfs_rename_nskip++;
214#endif
215 xfs_iunlock_map_shared(dp2, lock_mode);
216 xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
217 }
218
219 /*
220 * Set the return value. Null out any unused entries in i_tab.
221 */
222 *ipp1 = *ipp2 = NULL;
223 for (i=0; i < *num_inodes; i++) {
224 if (i_tab[i]->i_ino == inum1) {
225 *ipp1 = i_tab[i];
226 }
227 if (i_tab[i]->i_ino == inum2) {
228 *ipp2 = i_tab[i];
229 }
230 }
231 for (;i < 4; i++) {
232 i_tab[i] = NULL;
233 }
234 return 0;
235}
236
237
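/* Debug aid: records the __LINE__ of the most recent xfs_rename() failure. */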
238int rename_which_error_return = 0;
239
240/*
241 * xfs_rename
242 */
243int
244xfs_rename(
245 bhv_desc_t *src_dir_bdp,
246 vname_t *src_vname,
247 vnode_t *target_dir_vp,
248 vname_t *target_vname,
249 cred_t *credp)
250{
251 xfs_trans_t *tp;
252 xfs_inode_t *src_dp, *target_dp, *src_ip, *target_ip;
253 xfs_mount_t *mp;
254 int new_parent; /* moving to a new dir */
255 int src_is_directory; /* src_name is a directory */
256 int error;
257 xfs_bmap_free_t free_list;
258 xfs_fsblock_t first_block;
259 int cancel_flags;
260 int committed;
261 xfs_inode_t *inodes[4];
262 int target_ip_dropped = 0; /* dropped target_ip link? */
263 vnode_t *src_dir_vp;
264 bhv_desc_t *target_dir_bdp;
265 int spaceres;
266 int target_link_zero = 0;
267 int num_inodes;
268 char *src_name = VNAME(src_vname);
269 char *target_name = VNAME(target_vname);
270 int src_namelen = VNAMELEN(src_vname);
271 int target_namelen = VNAMELEN(target_vname);
272
273 src_dir_vp = BHV_TO_VNODE(src_dir_bdp);
274 vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address);
275 vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address);
276
277 /*
278 * Find the XFS behavior descriptor for the target directory
279 * vnode since it was not handed to us.
280 */
281 target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp),
282 &xfs_vnodeops);
283 if (target_dir_bdp == NULL) {
284 return XFS_ERROR(EXDEV);
285 }
286
287 src_dp = XFS_BHVTOI(src_dir_bdp);
288 target_dp = XFS_BHVTOI(target_dir_bdp);
289 mp = src_dp->i_mount;
290
291 if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) ||
292 DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
293 target_dp, DM_EVENT_RENAME)) {
294 error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
295 src_dir_vp, DM_RIGHT_NULL,
296 target_dir_vp, DM_RIGHT_NULL,
297 src_name, target_name,
298 0, 0, 0);
299 if (error) {
300 return error;
301 }
302 }
303 /* Return through std_return after this point. */
304
305 /*
306 * Lock all the participating inodes. Depending upon whether
307 * the target_name exists in the target directory, and
308 * whether the target directory is the same as the source
309 * directory, we can lock from 2 to 4 inodes.
310 * xfs_lock_for_rename() will return ENOENT if src_name
311 * does not exist in the source directory.
312 */
313 tp = NULL;
314 error = xfs_lock_for_rename(src_dp, target_dp, src_vname,
315 target_vname, &src_ip, &target_ip, inodes,
316 &num_inodes);
317
318 if (error) {
319 rename_which_error_return = __LINE__;
320 /*
321 * We have nothing locked, no inode references, and
322 * no transaction, so just get out.
323 */
324 goto std_return;
325 }
326
327 ASSERT(src_ip != NULL);
328
329 if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
330 /*
331 * Check for link count overflow on target_dp
332 */
333 if (target_ip == NULL && (src_dp != target_dp) &&
334 target_dp->i_d.di_nlink >= XFS_MAXLINK) {
335 rename_which_error_return = __LINE__;
336 error = XFS_ERROR(EMLINK);
337 xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
338 goto rele_return;
339 }
340 }
341
342 new_parent = (src_dp != target_dp);
343 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
344
345 /*
346 * Drop the locks on our inodes so that we can start the transaction.
347 */
348 xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
349
350 XFS_BMAP_INIT(&free_list, &first_block);
351 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
352 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
353 spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
354 error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
355 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
356 if (error == ENOSPC) {
357 spaceres = 0;
358 error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
359 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
360 }
361 if (error) {
362 rename_which_error_return = __LINE__;
363 xfs_trans_cancel(tp, 0);
364 goto rele_return;
365 }
366
367 /*
368 * Attach the dquots to the inodes
369 */
370 if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
371 xfs_trans_cancel(tp, cancel_flags);
372 rename_which_error_return = __LINE__;
373 goto rele_return;
374 }
375
376 /*
377 * Reacquire the inode locks we dropped above.
378 */
379 xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL);
380
381 /*
382 * Join all the inodes to the transaction. From this point on,
383 * we can rely on either trans_commit or trans_cancel to unlock
384 * them. Note that we need to add a vnode reference to the
385 * directories since trans_commit & trans_cancel will decrement
386 * them when they unlock the inodes. Also, we need to be careful
387 * not to add an inode to the transaction more than once.
388 */
389 VN_HOLD(src_dir_vp);
390 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
391 if (new_parent) {
392 VN_HOLD(target_dir_vp);
393 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
394 }
395 if ((src_ip != src_dp) && (src_ip != target_dp)) {
396 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
397 }
398 if ((target_ip != NULL) &&
399 (target_ip != src_ip) &&
400 (target_ip != src_dp) &&
401 (target_ip != target_dp)) {
402 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
403 }
404
405 /*
406 * Set up the target.
407 */
408 if (target_ip == NULL) {
409 /*
410 * If there's no space reservation, check the entry will
411 * fit before actually inserting it.
412 */
413 if (spaceres == 0 &&
414 (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name,
415 target_namelen))) {
416 rename_which_error_return = __LINE__;
417 goto error_return;
418 }
419 /*
420 * If target does not exist and the rename crosses
421 * directories, adjust the target directory link count
422 * to account for the ".." reference from the new entry.
423 */
424 error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name,
425 target_namelen, src_ip->i_ino,
426 &first_block, &free_list, spaceres);
427 if (error == ENOSPC) {
428 rename_which_error_return = __LINE__;
429 goto error_return;
430 }
431 if (error) {
432 rename_which_error_return = __LINE__;
433 goto abort_return;
434 }
435 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
436
437 if (new_parent && src_is_directory) {
438 error = xfs_bumplink(tp, target_dp);
439 if (error) {
440 rename_which_error_return = __LINE__;
441 goto abort_return;
442 }
443 }
444 } else { /* target_ip != NULL */
445
446 /*
447 * If target exists and it's a directory, check that both
448 * target and source are directories and that target can be
449 * destroyed, or that neither is a directory.
450 */
451 if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
452 /*
453 * Make sure target dir is empty.
454 */
455 if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) ||
456 (target_ip->i_d.di_nlink > 2)) {
457 error = XFS_ERROR(EEXIST);
458 rename_which_error_return = __LINE__;
459 goto error_return;
460 }
461 }
462
463 /*
464 * Link the source inode under the target name.
465 * If the source inode is a directory and we are moving
466 * it across directories, its ".." entry will be
467 * inconsistent until we replace that down below.
468 *
469 * In case there is already an entry with the same
470 * name at the destination directory, remove it first.
471 */
472 error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name,
473 target_namelen, src_ip->i_ino, &first_block,
474 &free_list, spaceres);
475 if (error) {
476 rename_which_error_return = __LINE__;
477 goto abort_return;
478 }
479 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
480
481 /*
482 * Decrement the link count on the target since the target
483 * dir no longer points to it.
484 */
485 error = xfs_droplink(tp, target_ip);
486 if (error) {
487 rename_which_error_return = __LINE__;
488 goto abort_return;
489 }
490 target_ip_dropped = 1;
491
492 if (src_is_directory) {
493 /*
494 * Drop the link from the old "." entry.
495 */
496 error = xfs_droplink(tp, target_ip);
497 if (error) {
498 rename_which_error_return = __LINE__;
499 goto abort_return;
500 }
501 }
502
503 /* Do this test while we still hold the locks */
504 target_link_zero = (target_ip)->i_d.di_nlink==0;
505
506 } /* target_ip != NULL */
507
508 /*
509 * Remove the source.
510 */
511 if (new_parent && src_is_directory) {
512
513 /*
514 * Rewrite the ".." entry to point to the new
515 * directory.
516 */
517 error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2,
518 target_dp->i_ino, &first_block,
519 &free_list, spaceres);
520 ASSERT(error != EEXIST);
521 if (error) {
522 rename_which_error_return = __LINE__;
523 goto abort_return;
524 }
525 xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
526
527 } else {
528 /*
529 * We always want to hit the ctime on the source inode.
530 * We do it in the if clause above for the 'new_parent &&
531 * src_is_directory' case, and here we get all the other
532 * cases. This isn't strictly required by the standards
533 * since the source inode isn't really being changed,
534 * but old unix file systems did it and some incremental
535 * backup programs won't work without it.
536 */
537 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
538 }
539
540 /*
541 * Adjust the link count on src_dp. This is necessary when
542 * renaming a directory, either within one parent when
543 * the target existed, or across two parent directories.
544 */
545 if (src_is_directory && (new_parent || target_ip != NULL)) {
546
547 /*
548 * Decrement link count on src_directory since the
549 * entry that's moved no longer points to it.
550 */
551 error = xfs_droplink(tp, src_dp);
552 if (error) {
553 rename_which_error_return = __LINE__;
554 goto abort_return;
555 }
556 }
557
558 error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen,
559 src_ip->i_ino, &first_block, &free_list, spaceres);
560 if (error) {
561 rename_which_error_return = __LINE__;
562 goto abort_return;
563 }
564 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
565
566 /*
567 * Update the generation counts on all the directory inodes
568 * that we're modifying.
569 */
570 src_dp->i_gen++;
571 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
572
573 if (new_parent) {
574 target_dp->i_gen++;
575 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
576 }
577
578 /*
579 * If there was a target inode, take an extra reference on
580 * it here so that it doesn't go to xfs_inactive() from
581 * within the commit.
582 */
583 if (target_ip != NULL) {
584 IHOLD(target_ip);
585 }
586
587 /*
588 * If this is a synchronous mount, make sure that the
589 * rename transaction goes to disk before returning to
590 * the user.
591 */
592 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
593 xfs_trans_set_sync(tp);
594 }
595
596 /*
597 * Take refs. for vop_link_removed calls below. No need to worry
598 * about directory refs. because the caller holds them.
599 *
600 * Do holds before the xfs_bmap_finish since it might rele them down
601 * to zero.
602 */
603
604 if (target_ip_dropped)
605 IHOLD(target_ip);
606 IHOLD(src_ip);
607
608 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
609 if (error) {
610 xfs_bmap_cancel(&free_list);
611 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
612 XFS_TRANS_ABORT));
613 if (target_ip != NULL) {
614 IRELE(target_ip);
615 }
616 if (target_ip_dropped) {
617 IRELE(target_ip);
618 }
619 IRELE(src_ip);
620 goto std_return;
621 }
622
623 /*
624 * trans_commit will unlock src_ip, target_ip & decrement
625 * the vnode references.
626 */
627 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
628 if (target_ip != NULL) {
629 xfs_refcache_purge_ip(target_ip);
630 IRELE(target_ip);
631 }
632 /*
633 * Let interposed file systems know about removed links.
634 */
635 if (target_ip_dropped) {
636 VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp,
637 target_link_zero);
638 IRELE(target_ip);
639 }
640
641 FSC_NOTIFY_NAME_CHANGED(XFS_ITOV(src_ip));
642
643 IRELE(src_ip);
644
645 /* Fall through to std_return with error = 0 or errno from
646 * xfs_trans_commit */
647std_return:
648 if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) ||
649 DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
650 target_dp, DM_EVENT_POSTRENAME)) {
651 (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
652 src_dir_vp, DM_RIGHT_NULL,
653 target_dir_vp, DM_RIGHT_NULL,
654 src_name, target_name,
655 0, error, 0);
656 }
657 return error;
658
659 abort_return:
660 cancel_flags |= XFS_TRANS_ABORT;
661 /* FALLTHROUGH */
662 error_return:
663 xfs_bmap_cancel(&free_list);
664 xfs_trans_cancel(tp, cancel_flags);
665 goto std_return;
666
667 rele_return:
668 IRELE(src_ip);
669 if (target_ip != NULL) {
670 IRELE(target_ip);
671 }
672 goto std_return;
673}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
new file mode 100644
index 000000000000..2c37822d1012
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.c
@@ -0,0 +1,2469 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Free realtime space allocation for XFS.
35 */
36
37#include "xfs.h"
38#include "xfs_macros.h"
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_log.h"
42#include "xfs_trans.h"
43#include "xfs_sb.h"
44#include "xfs_ag.h"
45#include "xfs_dir.h"
46#include "xfs_dir2.h"
47#include "xfs_dmapi.h"
48#include "xfs_mount.h"
49#include "xfs_alloc_btree.h"
50#include "xfs_bmap_btree.h"
51#include "xfs_ialloc_btree.h"
52#include "xfs_btree.h"
53#include "xfs_ialloc.h"
54#include "xfs_attr_sf.h"
55#include "xfs_dir_sf.h"
56#include "xfs_dir2_sf.h"
57#include "xfs_dinode.h"
58#include "xfs_inode.h"
59#include "xfs_alloc.h"
60#include "xfs_bmap.h"
61#include "xfs_bit.h"
62#include "xfs_rtalloc.h"
63#include "xfs_fsops.h"
64#include "xfs_error.h"
65#include "xfs_rw.h"
66#include "xfs_inode_item.h"
67#include "xfs_trans_space.h"
68
69
70/*
71 * Prototypes for internal functions.
72 */
73
74
75STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
76 xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
77STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
78 xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
79STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
80 xfs_extlen_t, int, xfs_rtblock_t *, int *);
81STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
82 xfs_rtblock_t, xfs_rtblock_t *);
83STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
84 xfs_rtblock_t, xfs_rtblock_t *);
85STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
86 xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
87STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
88 xfs_extlen_t, int);
89STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
90 xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
91
92/*
93 * Internal functions.
94 */
95
96/*
97 * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
98 */
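/* For example, xfs_lowbit32(0x18) == 3 and xfs_lowbit32(0) == -1. */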
99STATIC int
100xfs_lowbit32(
101 __uint32_t v)
102{
103 if (v)
104 return ffs(v) - 1;
105 return -1;
106}
107
108/*
109 * Allocate space to the bitmap or summary file, and zero it, for growfs.
110 */
111STATIC int /* error */
112xfs_growfs_rt_alloc(
113 xfs_mount_t *mp, /* file system mount point */
114 xfs_extlen_t oblocks, /* old count of blocks */
115 xfs_extlen_t nblocks, /* new count of blocks */
116 xfs_ino_t ino) /* inode number (bitmap/summary) */
117{
118 xfs_fileoff_t bno; /* block number in file */
119 xfs_buf_t *bp; /* temporary buffer for zeroing */
120 int cancelflags; /* flags for xfs_trans_cancel */
121 int committed; /* transaction committed flag */
122 xfs_daddr_t d; /* disk block address */
123 int error; /* error return value */
124 xfs_fsblock_t firstblock; /* first block allocated in xaction */
125 xfs_bmap_free_t flist; /* list of freed blocks */
126 xfs_fsblock_t fsbno; /* filesystem block for bno */
127 xfs_inode_t *ip; /* pointer to incore inode */
128 xfs_bmbt_irec_t map; /* block map output */
129 int nmap; /* number of block maps */
130 int resblks; /* space reservation */
131 xfs_trans_t *tp; /* transaction pointer */
132
133 /*
134 * Allocate space to the file, as necessary.
135 */
136 while (oblocks < nblocks) {
137 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
138 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
139 cancelflags = 0;
140 /*
141 * Reserve space & log for one extent added to the file.
142 */
143 if ((error = xfs_trans_reserve(tp, resblks,
144 XFS_GROWRTALLOC_LOG_RES(mp), 0,
145 XFS_TRANS_PERM_LOG_RES,
146 XFS_DEFAULT_PERM_LOG_COUNT)))
147 goto error_exit;
148 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
149 /*
150 * Lock the inode.
151 */
152 if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip)))
153 goto error_exit;
154 XFS_BMAP_INIT(&flist, &firstblock);
155 /*
156 * Allocate blocks to the bitmap file.
157 */
158 nmap = 1;
159 cancelflags |= XFS_TRANS_ABORT;
160 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
161 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
162 resblks, &map, &nmap, &flist);
163 if (!error && nmap < 1)
164 error = XFS_ERROR(ENOSPC);
165 if (error)
166 goto error_exit;
167 /*
168 * Free any blocks freed up in the transaction, then commit.
169 */
170 error = xfs_bmap_finish(&tp, &flist, firstblock, &committed);
171 if (error)
172 goto error_exit;
173 xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
174 /*
175 * Now we need to clear the allocated blocks.
176 * Do this one block per transaction, to keep it simple.
177 */
178 cancelflags = 0;
179 for (bno = map.br_startoff, fsbno = map.br_startblock;
180 bno < map.br_startoff + map.br_blockcount;
181 bno++, fsbno++) {
182 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
183 /*
184 * Reserve log for one block zeroing.
185 */
186 if ((error = xfs_trans_reserve(tp, 0,
187 XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
188 goto error_exit;
189 /*
190 * Lock the bitmap inode.
191 */
192 if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL,
193 &ip)))
194 goto error_exit;
195 /*
196 * Get a buffer for the block.
197 */
198 d = XFS_FSB_TO_DADDR(mp, fsbno);
199 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
200 mp->m_bsize, 0);
201 if (bp == NULL) {
202 error = XFS_ERROR(EIO);
203 goto error_exit;
204 }
205 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
206 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
207 /*
208 * Commit the transaction.
209 */
210 xfs_trans_commit(tp, 0, NULL);
211 }
212 /*
213 * Go on to the next extent, if any.
214 */
215 oblocks = map.br_startoff + map.br_blockcount;
216 }
217 return 0;
218error_exit:
219 xfs_trans_cancel(tp, cancelflags);
220 return error;
221}
222
223/*
224 * Attempt to allocate an extent minlen<=len<=maxlen starting from
225 * bitmap block bbno. If we don't get maxlen then use prod to trim
226 * the length, if given. Returns error; returns starting block in *rtblock.
227 * The lengths are all in rtextents.
228 */
229STATIC int /* error */
230xfs_rtallocate_extent_block(
231 xfs_mount_t *mp, /* file system mount point */
232 xfs_trans_t *tp, /* transaction pointer */
233 xfs_rtblock_t bbno, /* bitmap block number */
234 xfs_extlen_t minlen, /* minimum length to allocate */
235 xfs_extlen_t maxlen, /* maximum length to allocate */
236 xfs_extlen_t *len, /* out: actual length allocated */
237 xfs_rtblock_t *nextp, /* out: next block to try */
238 xfs_buf_t **rbpp, /* in/out: summary block buffer */
239 xfs_fsblock_t *rsb, /* in/out: summary block number */
240 xfs_extlen_t prod, /* extent product factor */
241 xfs_rtblock_t *rtblock) /* out: start block allocated */
242{
243 xfs_rtblock_t besti; /* best rtblock found so far */
244 xfs_rtblock_t bestlen; /* best length found so far */
245 xfs_rtblock_t end; /* last rtblock in chunk */
246 int error; /* error value */
247 xfs_rtblock_t i; /* current rtblock trying */
248 xfs_rtblock_t next; /* next rtblock to try */
249 int stat; /* status from internal calls */
250
251 /*
252 * Loop over all the extents starting in this bitmap block,
253 * looking for one that's long enough.
254 */
255 for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
256 end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
257 i <= end;
258 i++) {
259 /*
260 * See if there's a free extent of maxlen starting at i.
261		 * If not, next will contain the first non-free block.
262 */
263 error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
264 if (error) {
265 return error;
266 }
267 if (stat) {
268 /*
269			 * The maxlen blocks at i are all free; allocate and return them.
270 */
271 error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
272 rsb);
273 if (error) {
274 return error;
275 }
276 *len = maxlen;
277 *rtblock = i;
278 return 0;
279 }
280 /*
281 * In the case where we have a variable-sized allocation
282 * request, figure out how big this free piece is,
283		 * and if it's at least the minimum and larger than the
284		 * best so far, remember it.
285 */
286 if (minlen < maxlen) {
287 xfs_rtblock_t thislen; /* this extent size */
288
289 thislen = next - i;
290 if (thislen >= minlen && thislen > bestlen) {
291 besti = i;
292 bestlen = thislen;
293 }
294 }
295 /*
296 * If not done yet, find the start of the next free space.
297 */
298 if (next < end) {
299 error = xfs_rtfind_forw(mp, tp, next, end, &i);
300 if (error) {
301 return error;
302 }
303 } else
304 break;
305 }
306 /*
307 * Searched the whole thing & didn't find a maxlen free extent.
308 */
309 if (minlen < maxlen && besti != -1) {
310 xfs_extlen_t p; /* amount to trim length by */
311
312 /*
313 * If size should be a multiple of prod, make that so.
314 */
315 if (prod > 1 && (p = do_mod(bestlen, prod)))
316 bestlen -= p;
317 /*
318 * Allocate besti for bestlen & return that.
319 */
320 error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
321 if (error) {
322 return error;
323 }
324 *len = bestlen;
325 *rtblock = besti;
326 return 0;
327 }
328 /*
329 * Allocation failed. Set *nextp to the next block to try.
330 */
331 *nextp = next;
332 *rtblock = NULLRTBLOCK;
333 return 0;
334}
335
336/*
337 * Allocate an extent of length minlen<=len<=maxlen, starting at block
338 * bno. If we don't get maxlen then use prod to trim the length, if given.
339 * Returns error; returns starting block in *rtblock.
340 * The lengths are all in rtextents.
341 */
342STATIC int /* error */
343xfs_rtallocate_extent_exact(
344 xfs_mount_t *mp, /* file system mount point */
345 xfs_trans_t *tp, /* transaction pointer */
346 xfs_rtblock_t bno, /* starting block number to allocate */
347 xfs_extlen_t minlen, /* minimum length to allocate */
348 xfs_extlen_t maxlen, /* maximum length to allocate */
349 xfs_extlen_t *len, /* out: actual length allocated */
350 xfs_buf_t **rbpp, /* in/out: summary block buffer */
351 xfs_fsblock_t *rsb, /* in/out: summary block number */
352 xfs_extlen_t prod, /* extent product factor */
353 xfs_rtblock_t *rtblock) /* out: start block allocated */
354{
355 int error; /* error value */
356 xfs_extlen_t i; /* extent length trimmed due to prod */
357 int isfree; /* extent is free */
358 xfs_rtblock_t next; /* next block to try (dummy) */
359
360 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
361 /*
362 * Check if the range in question (for maxlen) is free.
363 */
364 error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
365 if (error) {
366 return error;
367 }
368 if (isfree) {
369 /*
370 * If it is, allocate it and return success.
371 */
372 error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
373 if (error) {
374 return error;
375 }
376 *len = maxlen;
377 *rtblock = bno;
378 return 0;
379 }
380 /*
381 * If not, allocate what there is, if it's at least minlen.
382 */
383 maxlen = next - bno;
384 if (maxlen < minlen) {
385 /*
386 * Failed, return failure status.
387 */
388 *rtblock = NULLRTBLOCK;
389 return 0;
390 }
391 /*
392 * Trim off tail of extent, if prod is specified.
393 */
394 if (prod > 1 && (i = maxlen % prod)) {
395 maxlen -= i;
396 if (maxlen < minlen) {
397 /*
398 * Now we can't do it, return failure status.
399 */
400 *rtblock = NULLRTBLOCK;
401 return 0;
402 }
403 }
404 /*
405 * Allocate what we can and return it.
406 */
407 error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
408 if (error) {
409 return error;
410 }
411 *len = maxlen;
412 *rtblock = bno;
413 return 0;
414}
415
416/*
417 * Allocate an extent of length minlen<=len<=maxlen, starting as near
418 * to bno as possible. If we don't get maxlen then use prod to trim
419 * the length, if given. The lengths are all in rtextents.
420 */
421STATIC int /* error */
422xfs_rtallocate_extent_near(
423 xfs_mount_t *mp, /* file system mount point */
424 xfs_trans_t *tp, /* transaction pointer */
425 xfs_rtblock_t bno, /* starting block number to allocate */
426 xfs_extlen_t minlen, /* minimum length to allocate */
427 xfs_extlen_t maxlen, /* maximum length to allocate */
428 xfs_extlen_t *len, /* out: actual length allocated */
429 xfs_buf_t **rbpp, /* in/out: summary block buffer */
430 xfs_fsblock_t *rsb, /* in/out: summary block number */
431 xfs_extlen_t prod, /* extent product factor */
432 xfs_rtblock_t *rtblock) /* out: start block allocated */
433{
434 int any; /* any useful extents from summary */
435 xfs_rtblock_t bbno; /* bitmap block number */
436 int error; /* error value */
437 int i; /* bitmap block offset (loop control) */
438 int j; /* secondary loop control */
439 int log2len; /* log2 of minlen */
440 xfs_rtblock_t n; /* next block to try */
441 xfs_rtblock_t r; /* result block */
442
443 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
444 /*
445 * If the block number given is off the end, silently set it to
446 * the last block.
447 */
448 if (bno >= mp->m_sb.sb_rextents)
449 bno = mp->m_sb.sb_rextents - 1;
450 /*
451 * Try the exact allocation first.
452 */
453 error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
454 rbpp, rsb, prod, &r);
455 if (error) {
456 return error;
457 }
458 /*
459 * If the exact allocation worked, return that.
460 */
461 if (r != NULLRTBLOCK) {
462 *rtblock = r;
463 return 0;
464 }
465 bbno = XFS_BITTOBLOCK(mp, bno);
466 i = 0;
467 log2len = xfs_highbit32(minlen);
468 /*
469 * Loop over all bitmap blocks (bbno + i is current block).
470 */
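	/*
	 * The loop control at the bottom expands i outward from 0,
	 * preferring the positive direction: e.g. for bbno = 10 the
	 * blocks are tried in the order 10, 11, 9, 12, 8, ...
	 * (clipped at either end of the bitmap).
	 */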
471 for (;;) {
472 /*
473 * Get summary information of extents of all useful levels
474 * starting in this bitmap block.
475 */
476 error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
477 bbno + i, rbpp, rsb, &any);
478 if (error) {
479 return error;
480 }
481 /*
482 * If there are any useful extents starting here, try
483 * allocating one.
484 */
485 if (any) {
486 /*
487 * On the positive side of the starting location.
488 */
489 if (i >= 0) {
490 /*
491 * Try to allocate an extent starting in
492 * this block.
493 */
494 error = xfs_rtallocate_extent_block(mp, tp,
495 bbno + i, minlen, maxlen, len, &n, rbpp,
496 rsb, prod, &r);
497 if (error) {
498 return error;
499 }
500 /*
501 * If it worked, return it.
502 */
503 if (r != NULLRTBLOCK) {
504 *rtblock = r;
505 return 0;
506 }
507 }
508 /*
509 * On the negative side of the starting location.
510 */
511 else { /* i < 0 */
512 /*
513 * Loop backwards through the bitmap blocks from
514				 * the starting point - 1 up to where we are now.
515 * There should be an extent which ends in this
516 * bitmap block and is long enough.
517 */
518 for (j = -1; j > i; j--) {
519 /*
520 * Grab the summary information for
521 * this bitmap block.
522 */
523 error = xfs_rtany_summary(mp, tp,
524 log2len, mp->m_rsumlevels - 1,
525 bbno + j, rbpp, rsb, &any);
526 if (error) {
527 return error;
528 }
529 /*
530 * If there's no extent given in the
531 * summary that means the extent we
532 * found must carry over from an
533 * earlier block. If there is an
534 * extent given, we've already tried
535					 * that allocation; don't do it again.
536 */
537 if (any)
538 continue;
539 error = xfs_rtallocate_extent_block(mp,
540 tp, bbno + j, minlen, maxlen,
541 len, &n, rbpp, rsb, prod, &r);
542 if (error) {
543 return error;
544 }
545 /*
546 * If it works, return the extent.
547 */
548 if (r != NULLRTBLOCK) {
549 *rtblock = r;
550 return 0;
551 }
552 }
553 /*
554 * There weren't intervening bitmap blocks
555 * with a long enough extent, or the
556 * allocation didn't work for some reason
557				 * (i.e. it's a little too short).
558 * Try to allocate from the summary block
559 * that we found.
560 */
561 error = xfs_rtallocate_extent_block(mp, tp,
562 bbno + i, minlen, maxlen, len, &n, rbpp,
563 rsb, prod, &r);
564 if (error) {
565 return error;
566 }
567 /*
568 * If it works, return the extent.
569 */
570 if (r != NULLRTBLOCK) {
571 *rtblock = r;
572 return 0;
573 }
574 }
575 }
576 /*
577	 * Loop control. If we were on the positive side, and there are
578	 * still more blocks on the negative side, go there.
579 */
580 if (i > 0 && (int)bbno - i >= 0)
581 i = -i;
582 /*
583 * If positive, and no more negative, but there are more
584 * positive, go there.
585 */
586 else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
587 i++;
588 /*
589 * If negative or 0 (just started), and there are positive
590 * blocks to go, go there. The 0 case moves to block 1.
591 */
592 else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
593 i = 1 - i;
594 /*
595 * If negative or 0 and there are more negative blocks,
596 * go there.
597 */
598 else if (i <= 0 && (int)bbno + i > 0)
599 i--;
600 /*
601 * Must be done. Return failure.
602 */
603 else
604 break;
605 }
606 *rtblock = NULLRTBLOCK;
607 return 0;
608}
609
610/*
611 * Allocate an extent of length minlen<=len<=maxlen, with no position
612 * specified. If we don't get maxlen then use prod to trim
613 * the length, if given. The lengths are all in rtextents.
614 */
615STATIC int /* error */
616xfs_rtallocate_extent_size(
617 xfs_mount_t *mp, /* file system mount point */
618 xfs_trans_t *tp, /* transaction pointer */
619 xfs_extlen_t minlen, /* minimum length to allocate */
620 xfs_extlen_t maxlen, /* maximum length to allocate */
621 xfs_extlen_t *len, /* out: actual length allocated */
622 xfs_buf_t **rbpp, /* in/out: summary block buffer */
623 xfs_fsblock_t *rsb, /* in/out: summary block number */
624 xfs_extlen_t prod, /* extent product factor */
625 xfs_rtblock_t *rtblock) /* out: start block allocated */
626{
627 int error; /* error value */
628 int i; /* bitmap block number */
629 int l; /* level number (loop control) */
630 xfs_rtblock_t n; /* next block to be tried */
631 xfs_rtblock_t r; /* result block number */
632 xfs_suminfo_t sum; /* summary information for extents */
633
634 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
635 /*
636 * Loop over all the levels starting with maxlen.
637 * At each level, look at all the bitmap blocks, to see if there
638 * are extents starting there that are long enough (>= maxlen).
639	 * Note that only at the initial level can the allocation fail even
640	 * though the summary says there's an extent.
641 */
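	/*
	 * Each summary level l covers free extents whose length is in
	 * [2^l, 2^(l+1) - 1] rtextents (see the XFS_RTMAX/XFS_RTMIN
	 * clamping in the second pass below). The starting level,
	 * xfs_highbit32(maxlen), can therefore also contain extents
	 * shorter than maxlen; every higher level holds only extents
	 * longer than maxlen, so only the first level's attempts can
	 * fail this way.
	 */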
642 for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
643 /*
644 * Loop over all the bitmap blocks.
645 */
646 for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
647 /*
648 * Get the summary for this level/block.
649 */
650 error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
651 &sum);
652 if (error) {
653 return error;
654 }
655 /*
656 * Nothing there, on to the next block.
657 */
658 if (!sum)
659 continue;
660 /*
661 * Try allocating the extent.
662 */
663 error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
664 maxlen, len, &n, rbpp, rsb, prod, &r);
665 if (error) {
666 return error;
667 }
668 /*
669 * If it worked, return that.
670 */
671 if (r != NULLRTBLOCK) {
672 *rtblock = r;
673 return 0;
674 }
675 /*
676 * If the "next block to try" returned from the
677 * allocator is beyond the next bitmap block,
678 * skip to that bitmap block.
679 */
680 if (XFS_BITTOBLOCK(mp, n) > i + 1)
681 i = XFS_BITTOBLOCK(mp, n) - 1;
682 }
683 }
684 /*
685 * Didn't find any maxlen blocks. Try smaller ones, unless
686 * we're asking for a fixed size extent.
687 */
688 if (minlen > --maxlen) {
689 *rtblock = NULLRTBLOCK;
690 return 0;
691 }
692 /*
693 * Loop over sizes, from maxlen down to minlen.
694 * This time, when we do the allocations, allow smaller ones
695 * to succeed.
696 */
697 for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
698 /*
699 * Loop over all the bitmap blocks, try an allocation
700 * starting in that block.
701 */
702 for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
703 /*
704 * Get the summary information for this level/block.
705 */
706 error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
707 &sum);
708 if (error) {
709 return error;
710 }
711 /*
712 * If nothing there, go on to next.
713 */
714 if (!sum)
715 continue;
716 /*
717 * Try the allocation. Make sure the specified
718 * minlen/maxlen are in the possible range for
719 * this summary level.
720 */
721 error = xfs_rtallocate_extent_block(mp, tp, i,
722 XFS_RTMAX(minlen, 1 << l),
723 XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
724 len, &n, rbpp, rsb, prod, &r);
725 if (error) {
726 return error;
727 }
728 /*
729 * If it worked, return that extent.
730 */
731 if (r != NULLRTBLOCK) {
732 *rtblock = r;
733 return 0;
734 }
735 /*
736 * If the "next block to try" returned from the
737 * allocator is beyond the next bitmap block,
738 * skip to that bitmap block.
739 */
740 if (XFS_BITTOBLOCK(mp, n) > i + 1)
741 i = XFS_BITTOBLOCK(mp, n) - 1;
742 }
743 }
744 /*
745 * Got nothing, return failure.
746 */
747 *rtblock = NULLRTBLOCK;
748 return 0;
749}
750
751/*
752 * Mark an extent specified by start and len allocated.
753 * Updates all the summary information as well as the bitmap.
754 */
755STATIC int /* error */
756xfs_rtallocate_range(
757 xfs_mount_t *mp, /* file system mount point */
758 xfs_trans_t *tp, /* transaction pointer */
759 xfs_rtblock_t start, /* start block to allocate */
760 xfs_extlen_t len, /* length to allocate */
761 xfs_buf_t **rbpp, /* in/out: summary block buffer */
762 xfs_fsblock_t *rsb) /* in/out: summary block number */
763{
764 xfs_rtblock_t end; /* end of the allocated extent */
765 int error; /* error value */
766	xfs_rtblock_t	postblock;	/* last block of the old free extent */
767	xfs_rtblock_t	preblock;	/* first block of the old free extent */
768
769 end = start + len - 1;
770 /*
771 * Assume we're allocating out of the middle of a free extent.
772 * We need to find the beginning and end of the extent so we can
773 * properly update the summary.
774 */
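	/*
	 * For example (hypothetical numbers): allocating [105..110] out
	 * of a free extent [100..120] removes the summary entry for the
	 * 21-block extent and re-adds entries for the two remainders,
	 * [100..104] and [111..120], before marking [105..110]
	 * allocated in the bitmap.
	 */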
775 error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
776 if (error) {
777 return error;
778 }
779 /*
780 * Find the next allocated block (end of free extent).
781 */
782 error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
783 &postblock);
784 if (error) {
785 return error;
786 }
787 /*
788 * Decrement the summary information corresponding to the entire
789 * (old) free extent.
790 */
791 error = xfs_rtmodify_summary(mp, tp,
792 XFS_RTBLOCKLOG(postblock + 1 - preblock),
793 XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
794 if (error) {
795 return error;
796 }
797 /*
798 * If there are blocks not being allocated at the front of the
799 * old extent, add summary data for them to be free.
800 */
801 if (preblock < start) {
802 error = xfs_rtmodify_summary(mp, tp,
803 XFS_RTBLOCKLOG(start - preblock),
804 XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
805 if (error) {
806 return error;
807 }
808 }
809 /*
810 * If there are blocks not being allocated at the end of the
811 * old extent, add summary data for them to be free.
812 */
813 if (postblock > end) {
814 error = xfs_rtmodify_summary(mp, tp,
815 XFS_RTBLOCKLOG(postblock - end),
816 XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
817 if (error) {
818 return error;
819 }
820 }
821 /*
822 * Modify the bitmap to mark this extent allocated.
823 */
824 error = xfs_rtmodify_range(mp, tp, start, len, 0);
825 return error;
826}
827
828/*
829 * Return whether there are any free extents in the size range given
830 * by low and high, for the bitmap block bbno.
831 */
832STATIC int /* error */
833xfs_rtany_summary(
834 xfs_mount_t *mp, /* file system mount structure */
835 xfs_trans_t *tp, /* transaction pointer */
836 int low, /* low log2 extent size */
837 int high, /* high log2 extent size */
838 xfs_rtblock_t bbno, /* bitmap block number */
839 xfs_buf_t **rbpp, /* in/out: summary block buffer */
840 xfs_fsblock_t *rsb, /* in/out: summary block number */
841 int *stat) /* out: any good extents here? */
842{
843 int error; /* error value */
844 int log; /* loop counter, log2 of ext. size */
845 xfs_suminfo_t sum; /* summary data */
846
847 /*
848 * Loop over logs of extent sizes. Order is irrelevant.
849 */
850 for (log = low; log <= high; log++) {
851 /*
852 * Get one summary datum.
853 */
854 error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
855 if (error) {
856 return error;
857 }
858 /*
859 * If there are any, return success.
860 */
861 if (sum) {
862 *stat = 1;
863 return 0;
864 }
865 }
866 /*
867 * Found nothing, return failure.
868 */
869 *stat = 0;
870 return 0;
871}
872
873/*
874 * Get a buffer for the bitmap or summary file block specified.
875 * The buffer is returned read and locked.
876 */
877STATIC int /* error */
878xfs_rtbuf_get(
879 xfs_mount_t *mp, /* file system mount structure */
880 xfs_trans_t *tp, /* transaction pointer */
881 xfs_rtblock_t block, /* block number in bitmap or summary */
882 int issum, /* is summary not bitmap */
883 xfs_buf_t **bpp) /* output: buffer for the block */
884{
885 xfs_buf_t *bp; /* block buffer, result */
886 xfs_daddr_t d; /* disk addr of block */
887 int error; /* error value */
888 xfs_fsblock_t fsb; /* fs block number for block */
889 xfs_inode_t *ip; /* bitmap or summary inode */
890
891 ip = issum ? mp->m_rsumip : mp->m_rbmip;
892 /*
893 * Map from the file offset (block) and inode number to the
894 * file system block.
895 */
896 error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
897 if (error) {
898 return error;
899 }
900 ASSERT(fsb != NULLFSBLOCK);
901 /*
902 * Convert to disk address for buffer cache.
903 */
904 d = XFS_FSB_TO_DADDR(mp, fsb);
905 /*
906 * Read the buffer.
907 */
908 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
909 mp->m_bsize, 0, &bp);
910 if (error) {
911 return error;
912 }
913 ASSERT(bp && !XFS_BUF_GETERROR(bp));
914 *bpp = bp;
915 return 0;
916}
917
918#ifdef DEBUG
919/*
920 * Check that the given extent (block range) is allocated already.
921 */
922STATIC int /* error */
923xfs_rtcheck_alloc_range(
924 xfs_mount_t *mp, /* file system mount point */
925 xfs_trans_t *tp, /* transaction pointer */
926 xfs_rtblock_t bno, /* starting block number of extent */
927 xfs_extlen_t len, /* length of extent */
928 int *stat) /* out: 1 for allocated, 0 for not */
929{
930 xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
931
932 return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
933}
934#endif
935
936#ifdef DEBUG
937/*
938 * Check whether the given block in the bitmap has the given value.
939 */
940STATIC int /* 1 for matches, 0 for not */
941xfs_rtcheck_bit(
942 xfs_mount_t *mp, /* file system mount structure */
943 xfs_trans_t *tp, /* transaction pointer */
944 xfs_rtblock_t start, /* bit (block) to check */
945 int val) /* 1 for free, 0 for allocated */
946{
947 int bit; /* bit number in the word */
948 xfs_rtblock_t block; /* bitmap block number */
949 xfs_buf_t *bp; /* buf for the block */
950 xfs_rtword_t *bufp; /* pointer into the buffer */
951 /* REFERENCED */
952 int error; /* error value */
953 xfs_rtword_t wdiff; /* difference between bit & expected */
954 int word; /* word number in the buffer */
955 xfs_rtword_t wval; /* word value from buffer */
956
957 block = XFS_BITTOBLOCK(mp, start);
958 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
959 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
960 word = XFS_BITTOWORD(mp, start);
961 bit = (int)(start & (XFS_NBWORD - 1));
962 wval = bufp[word];
963 xfs_trans_brelse(tp, bp);
964 wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit);
965 return !wdiff;
966}
967#endif /* DEBUG */
968
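The wdiff test above relies on a small trick worth spelling out: negating an int val of 0 or 1 yields the bit patterns all-zeroes or all-ones, so a single XOR plus mask compares one bitmap bit against its expected state. A stand-alone sketch (uint32_t standing in for xfs_rtword_t; values are illustrative):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Model of the wdiff test in xfs_rtcheck_bit: -val turns 0/1
	 * into all-zeroes/all-ones, so XOR-and-mask tells us whether
	 * the bit of interest has the expected state.
	 */
	static int check_bit(uint32_t wval, int bit, int val)
	{
		uint32_t wdiff = (wval ^ (uint32_t)-val) &
				 ((uint32_t)1 << bit);

		return !wdiff;	/* 1 if the bit has the expected value */
	}

	int main(void)
	{
		uint32_t w = 0x0000000a;	/* bits 1 and 3 set (free) */

		assert(check_bit(w, 1, 1));	/* bit 1 is free */
		assert(check_bit(w, 0, 0));	/* bit 0 is allocated */
		assert(!check_bit(w, 3, 0));	/* bit 3 is not allocated */
		printf("all checks passed\n");
		return 0;
	}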
969#if 0
970/*
971 * Check that the given extent (block range) is free already.
972 */
973STATIC int /* error */
974xfs_rtcheck_free_range(
975 xfs_mount_t *mp, /* file system mount point */
976 xfs_trans_t *tp, /* transaction pointer */
977 xfs_rtblock_t bno, /* starting block number of extent */
978 xfs_extlen_t len, /* length of extent */
979 int *stat) /* out: 1 for free, 0 for not */
980{
981 xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
982
983 return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat);
984}
985#endif
986
987/*
988 * Check that the given range is either all allocated (val = 0) or
989 * all free (val = 1).
990 */
991STATIC int /* error */
992xfs_rtcheck_range(
993 xfs_mount_t *mp, /* file system mount point */
994 xfs_trans_t *tp, /* transaction pointer */
995 xfs_rtblock_t start, /* starting block number of extent */
996 xfs_extlen_t len, /* length of extent */
997 int val, /* 1 for free, 0 for allocated */
998 xfs_rtblock_t *new, /* out: first block not matching */
999 int *stat) /* out: 1 for matches, 0 for not */
1000{
1001 xfs_rtword_t *b; /* current word in buffer */
1002 int bit; /* bit number in the word */
1003 xfs_rtblock_t block; /* bitmap block number */
1004 xfs_buf_t *bp; /* buf for the block */
1005 xfs_rtword_t *bufp; /* starting word in buffer */
1006 int error; /* error value */
1007 xfs_rtblock_t i; /* current bit number rel. to start */
1008 xfs_rtblock_t lastbit; /* last useful bit in word */
1009 xfs_rtword_t mask; /* mask of relevant bits for value */
1010 xfs_rtword_t wdiff; /* difference from wanted value */
1011 int word; /* word number in the buffer */
1012
1013 /*
1014 * Compute starting bitmap block number
1015 */
1016 block = XFS_BITTOBLOCK(mp, start);
1017 /*
1018 * Read the bitmap block.
1019 */
1020 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1021 if (error) {
1022 return error;
1023 }
1024 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1025 /*
1026 * Compute the starting word's address, and starting bit.
1027 */
1028 word = XFS_BITTOWORD(mp, start);
1029 b = &bufp[word];
1030 bit = (int)(start & (XFS_NBWORD - 1));
1031 /*
1032 * 0 (allocated) => all zero's; 1 (free) => all one's.
1033 */
1034 val = -val;
1035 /*
1036 * If not starting on a word boundary, deal with the first
1037 * (partial) word.
1038 */
1039 if (bit) {
1040 /*
1041 * Compute first bit not examined.
1042 */
1043 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
1044 /*
1045 * Mask of relevant bits.
1046 */
1047 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
1048 /*
1049 * Compute difference between actual and desired value.
1050 */
1051 if ((wdiff = (*b ^ val) & mask)) {
1052 /*
1053 * Different, compute first wrong bit and return.
1054 */
1055 xfs_trans_brelse(tp, bp);
1056 i = XFS_RTLOBIT(wdiff) - bit;
1057 *new = start + i;
1058 *stat = 0;
1059 return 0;
1060 }
1061 i = lastbit - bit;
1062 /*
1063 * Go on to next block if that's where the next word is
1064 * and we need the next word.
1065 */
1066 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1067 /*
1068 * If done with this block, get the next one.
1069 */
1070 xfs_trans_brelse(tp, bp);
1071 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1072 if (error) {
1073 return error;
1074 }
1075 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1076 word = 0;
1077 } else {
1078 /*
1079 * Go on to the next word in the buffer.
1080 */
1081 b++;
1082 }
1083 } else {
1084 /*
1085 * Starting on a word boundary, no partial word.
1086 */
1087 i = 0;
1088 }
1089 /*
1090 * Loop over whole words in buffers. When we use up one buffer
1091 * we move on to the next one.
1092 */
1093 while (len - i >= XFS_NBWORD) {
1094 /*
1095 * Compute difference between actual and desired value.
1096 */
1097 if ((wdiff = *b ^ val)) {
1098 /*
1099 * Different, compute first wrong bit and return.
1100 */
1101 xfs_trans_brelse(tp, bp);
1102 i += XFS_RTLOBIT(wdiff);
1103 *new = start + i;
1104 *stat = 0;
1105 return 0;
1106 }
1107 i += XFS_NBWORD;
1108 /*
1109 * Go on to next block if that's where the next word is
1110 * and we need the next word.
1111 */
1112 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1113 /*
1114 * If done with this block, get the next one.
1115 */
1116 xfs_trans_brelse(tp, bp);
1117 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1118 if (error) {
1119 return error;
1120 }
1121 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1122 word = 0;
1123 } else {
1124 /*
1125 * Go on to the next word in the buffer.
1126 */
1127 b++;
1128 }
1129 }
1130 /*
1131 * If not ending on a word boundary, deal with the last
1132 * (partial) word.
1133 */
1134 if ((lastbit = len - i)) {
1135 /*
1136 * Mask of relevant bits.
1137 */
1138 mask = ((xfs_rtword_t)1 << lastbit) - 1;
1139 /*
1140 * Compute difference between actual and desired value.
1141 */
1142 if ((wdiff = (*b ^ val) & mask)) {
1143 /*
1144 * Different, compute first wrong bit and return.
1145 */
1146 xfs_trans_brelse(tp, bp);
1147 i += XFS_RTLOBIT(wdiff);
1148 *new = start + i;
1149 *stat = 0;
1150 return 0;
1151 } else
1152 i = len;
1153 }
1154 /*
1155 * Successful, return.
1156 */
1157 xfs_trans_brelse(tp, bp);
1158 *new = start + i;
1159 *stat = 1;
1160 return 0;
1161}
1162
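The same partial-word / whole-word / partial-word structure recurs in every range routine in this file, so a compact user-space model of the scan may help. This is an illustrative sketch under simplifying assumptions: the bitmap is a flat uint32_t array rather than a chain of buffer-cache blocks, and __builtin_ctz stands in for XFS_RTLOBIT.

	#include <stdint.h>
	#include <stdio.h>

	#define NBWORD 32

	/*
	 * Model of the scan in xfs_rtcheck_range: verify that bits
	 * [start, start+len) all equal val, handling a partial leading
	 * word, whole words, and a partial trailing word.  Returns 1
	 * on a full match, else 0 with *bad set to the first
	 * offending bit.
	 */
	static int check_range(const uint32_t *bmp, uint64_t start,
			uint64_t len, int val, uint64_t *bad)
	{
		uint32_t want = (uint32_t)-val;	/* all ones or all zeroes */
		uint64_t i = 0;
		int bit = start & (NBWORD - 1);
		const uint32_t *b = &bmp[start / NBWORD];

		if (bit) {			/* partial first word */
			int lastbit = bit + len < NBWORD ?
				      bit + (int)len : NBWORD;
			uint32_t mask = ((1u << (lastbit - bit)) - 1) << bit;
			uint32_t wdiff = (*b ^ want) & mask;

			if (wdiff) {
				*bad = start + __builtin_ctz(wdiff) - bit;
				return 0;
			}
			i = lastbit - bit;
			b++;
		}
		while (len - i >= NBWORD) {	/* whole words */
			uint32_t wdiff = *b ^ want;

			if (wdiff) {
				*bad = start + i + __builtin_ctz(wdiff);
				return 0;
			}
			i += NBWORD;
			b++;
		}
		if (len - i) {			/* partial last word */
			uint32_t mask = (1u << (len - i)) - 1;
			uint32_t wdiff = (*b ^ want) & mask;

			if (wdiff) {
				*bad = start + i + __builtin_ctz(wdiff);
				return 0;
			}
		}
		return 1;
	}

	int main(void)
	{
		uint32_t bmp[2] = { 0xffffffff, 0x00000007 }; /* bits 0..34 free */
		uint64_t bad = 0;
		int ok;

		ok = check_range(bmp, 0, 35, 1, &bad);
		printf("bits 0..34 all free: %d\n", ok);
		ok = check_range(bmp, 0, 37, 1, &bad);
		printf("bits 0..36 all free: %d, first bad bit %llu\n",
		       ok, (unsigned long long)bad);
		return 0;
	}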
1163/*
1164 * Copy and transform the summary file, given the old and new
1165 * parameters in the mount structures.
1166 */
1167STATIC int /* error */
1168xfs_rtcopy_summary(
1169 xfs_mount_t *omp, /* old file system mount point */
1170 xfs_mount_t *nmp, /* new file system mount point */
1171 xfs_trans_t *tp) /* transaction pointer */
1172{
1173 xfs_rtblock_t bbno; /* bitmap block number */
1174 xfs_buf_t *bp; /* summary buffer */
1175 int error; /* error return value */
1176 int log; /* summary level number (log length) */
1177 xfs_suminfo_t sum; /* summary data */
1178 xfs_fsblock_t sumbno; /* summary block number */
1179
1180 bp = NULL;
1181 for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
1182 for (bbno = omp->m_sb.sb_rbmblocks - 1;
1183 (xfs_srtblock_t)bbno >= 0;
1184 bbno--) {
1185 error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
1186 &sumbno, &sum);
1187 if (error)
1188 return error;
1189 if (sum == 0)
1190 continue;
1191 error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
1192 &bp, &sumbno);
1193 if (error)
1194 return error;
1195 error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
1196 &bp, &sumbno);
1197 if (error)
1198 return error;
1199 ASSERT(sum > 0);
1200 }
1201 }
1202 return 0;
1203}
1204
1205/*
1206 * Searching backward from start to limit, find the first block whose
1207 * allocated/free state is different from start's.
1208 */
1209STATIC int /* error */
1210xfs_rtfind_back(
1211 xfs_mount_t *mp, /* file system mount point */
1212 xfs_trans_t *tp, /* transaction pointer */
1213 xfs_rtblock_t start, /* starting block to look at */
1214 xfs_rtblock_t limit, /* last block to look at */
1215 xfs_rtblock_t *rtblock) /* out: start block found */
1216{
1217 xfs_rtword_t *b; /* current word in buffer */
1218 int bit; /* bit number in the word */
1219 xfs_rtblock_t block; /* bitmap block number */
1220 xfs_buf_t *bp; /* buf for the block */
1221 xfs_rtword_t *bufp; /* starting word in buffer */
1222 int error; /* error value */
1223 xfs_rtblock_t firstbit; /* first useful bit in the word */
1224 xfs_rtblock_t i; /* current bit number rel. to start */
1225 xfs_rtblock_t len; /* length of inspected area */
1226 xfs_rtword_t mask; /* mask of relevant bits for value */
1227 xfs_rtword_t want; /* mask for "good" values */
1228 xfs_rtword_t wdiff; /* difference from wanted value */
1229 int word; /* word number in the buffer */
1230
1231 /*
1232 * Compute and read in starting bitmap block for starting block.
1233 */
1234 block = XFS_BITTOBLOCK(mp, start);
1235 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1236 if (error) {
1237 return error;
1238 }
1239 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1240 /*
1241 * Get the first word's index & point to it.
1242 */
1243 word = XFS_BITTOWORD(mp, start);
1244 b = &bufp[word];
1245 bit = (int)(start & (XFS_NBWORD - 1));
1246 len = start - limit + 1;
1247 /*
1248 * Compute match value, based on the bit at start: if 1 (free)
1249 * then all-ones, else all-zeroes.
1250 */
1251 want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
1252 /*
1253 * If the starting position is not word-aligned, deal with the
1254 * partial word.
1255 */
1256 if (bit < XFS_NBWORD - 1) {
1257 /*
1258 * Calculate first (leftmost) bit number to look at,
1259 * and mask for all the relevant bits in this word.
1260 */
1261 firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
1262 mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
1263 firstbit;
1264 /*
1265 * Calculate the difference between the value there
1266 * and what we're looking for.
1267 */
1268 if ((wdiff = (*b ^ want) & mask)) {
1269 /*
1270 * Different. Mark where we are and return.
1271 */
1272 xfs_trans_brelse(tp, bp);
1273 i = bit - XFS_RTHIBIT(wdiff);
1274 *rtblock = start - i + 1;
1275 return 0;
1276 }
1277 i = bit - firstbit + 1;
1278 /*
1279 * Go on to previous block if that's where the previous word is
1280 * and we need the previous word.
1281 */
1282 if (--word == -1 && i < len) {
1283 /*
1284 * If done with this block, get the previous one.
1285 */
1286 xfs_trans_brelse(tp, bp);
1287 error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
1288 if (error) {
1289 return error;
1290 }
1291 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1292 word = XFS_BLOCKWMASK(mp);
1293 b = &bufp[word];
1294 } else {
1295 /*
1296 * Go on to the previous word in the buffer.
1297 */
1298 b--;
1299 }
1300 } else {
1301 /*
1302 * Starting on a word boundary, no partial word.
1303 */
1304 i = 0;
1305 }
1306 /*
1307 * Loop over whole words in buffers. When we use up one buffer
1308 * we move on to the previous one.
1309 */
1310 while (len - i >= XFS_NBWORD) {
1311 /*
1312 * Compute difference between actual and desired value.
1313 */
1314 if ((wdiff = *b ^ want)) {
1315 /*
1316 * Different, mark where we are and return.
1317 */
1318 xfs_trans_brelse(tp, bp);
1319 i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
1320 *rtblock = start - i + 1;
1321 return 0;
1322 }
1323 i += XFS_NBWORD;
1324 /*
1325 * Go on to previous block if that's where the previous word is
1326 * and we need the previous word.
1327 */
1328 if (--word == -1 && i < len) {
1329 /*
1330 * If done with this block, get the previous one.
1331 */
1332 xfs_trans_brelse(tp, bp);
1333 error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
1334 if (error) {
1335 return error;
1336 }
1337 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1338 word = XFS_BLOCKWMASK(mp);
1339 b = &bufp[word];
1340 } else {
1341 /*
1342 * Go on to the previous word in the buffer.
1343 */
1344 b--;
1345 }
1346 }
1347 /*
1348 * If not ending on a word boundary, deal with the last
1349 * (partial) word.
1350 */
1351 if (len - i) {
1352 /*
1353 * Calculate first (leftmost) bit number to look at,
1354 * and mask for all the relevant bits in this word.
1355 */
1356 firstbit = XFS_NBWORD - (len - i);
1357 mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
1358 /*
1359 * Compute difference between actual and desired value.
1360 */
1361 if ((wdiff = (*b ^ want) & mask)) {
1362 /*
1363 * Different, mark where we are and return.
1364 */
1365 xfs_trans_brelse(tp, bp);
1366 i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
1367 *rtblock = start - i + 1;
1368 return 0;
1369 } else
1370 i = len;
1371 }
1372 /*
1373 * No match, return that we scanned the whole area.
1374 */
1375 xfs_trans_brelse(tp, bp);
1376 *rtblock = start - i + 1;
1377 return 0;
1378}
1379
1380/*
1381 * Searching forward from start to limit, find the first block whose
1382 * allocated/free state is different from start's.
1383 */
1384STATIC int /* error */
1385xfs_rtfind_forw(
1386 xfs_mount_t *mp, /* file system mount point */
1387 xfs_trans_t *tp, /* transaction pointer */
1388 xfs_rtblock_t start, /* starting block to look at */
1389 xfs_rtblock_t limit, /* last block to look at */
1390 xfs_rtblock_t *rtblock) /* out: start block found */
1391{
1392 xfs_rtword_t *b; /* current word in buffer */
1393 int bit; /* bit number in the word */
1394 xfs_rtblock_t block; /* bitmap block number */
1395 xfs_buf_t *bp; /* buf for the block */
1396 xfs_rtword_t *bufp; /* starting word in buffer */
1397 int error; /* error value */
1398 xfs_rtblock_t i; /* current bit number rel. to start */
1399 xfs_rtblock_t lastbit; /* last useful bit in the word */
1400 xfs_rtblock_t len; /* length of inspected area */
1401 xfs_rtword_t mask; /* mask of relevant bits for value */
1402 xfs_rtword_t want; /* mask for "good" values */
1403 xfs_rtword_t wdiff; /* difference from wanted value */
1404 int word; /* word number in the buffer */
1405
1406 /*
1407 * Compute and read in starting bitmap block for starting block.
1408 */
1409 block = XFS_BITTOBLOCK(mp, start);
1410 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1411 if (error) {
1412 return error;
1413 }
1414 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1415 /*
1416 * Get the first word's index & point to it.
1417 */
1418 word = XFS_BITTOWORD(mp, start);
1419 b = &bufp[word];
1420 bit = (int)(start & (XFS_NBWORD - 1));
1421 len = limit - start + 1;
1422 /*
1423 * Compute match value, based on the bit at start: if 1 (free)
1424 * then all-ones, else all-zeroes.
1425 */
1426 want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
1427 /*
1428 * If the starting position is not word-aligned, deal with the
1429 * partial word.
1430 */
1431 if (bit) {
1432 /*
1433 * Calculate last (rightmost) bit number to look at,
1434 * and mask for all the relevant bits in this word.
1435 */
1436 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
1437 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
1438 /*
1439 * Calculate the difference between the value there
1440 * and what we're looking for.
1441 */
1442 if ((wdiff = (*b ^ want) & mask)) {
1443 /*
1444 * Different. Mark where we are and return.
1445 */
1446 xfs_trans_brelse(tp, bp);
1447 i = XFS_RTLOBIT(wdiff) - bit;
1448 *rtblock = start + i - 1;
1449 return 0;
1450 }
1451 i = lastbit - bit;
1452 /*
1453 * Go on to next block if that's where the next word is
1454 * and we need the next word.
1455 */
1456 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1457 /*
1458	 * If done with this block, get the next one.
1459 */
1460 xfs_trans_brelse(tp, bp);
1461 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1462 if (error) {
1463 return error;
1464 }
1465 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1466 word = 0;
1467 } else {
1468 /*
1469	 * Go on to the next word in the buffer.
1470 */
1471 b++;
1472 }
1473 } else {
1474 /*
1475 * Starting on a word boundary, no partial word.
1476 */
1477 i = 0;
1478 }
1479 /*
1480 * Loop over whole words in buffers. When we use up one buffer
1481 * we move on to the next one.
1482 */
1483 while (len - i >= XFS_NBWORD) {
1484 /*
1485 * Compute difference between actual and desired value.
1486 */
1487 if ((wdiff = *b ^ want)) {
1488 /*
1489 * Different, mark where we are and return.
1490 */
1491 xfs_trans_brelse(tp, bp);
1492 i += XFS_RTLOBIT(wdiff);
1493 *rtblock = start + i - 1;
1494 return 0;
1495 }
1496 i += XFS_NBWORD;
1497 /*
1498 * Go on to next block if that's where the next word is
1499 * and we need the next word.
1500 */
1501 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1502 /*
1503 * If done with this block, get the next one.
1504 */
1505 xfs_trans_brelse(tp, bp);
1506 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1507 if (error) {
1508 return error;
1509 }
1510 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1511 word = 0;
1512 } else {
1513 /*
1514 * Go on to the next word in the buffer.
1515 */
1516 b++;
1517 }
1518 }
1519 /*
1520 * If not ending on a word boundary, deal with the last
1521 * (partial) word.
1522 */
1523 if ((lastbit = len - i)) {
1524 /*
1525 * Calculate mask for all the relevant bits in this word.
1526 */
1527 mask = ((xfs_rtword_t)1 << lastbit) - 1;
1528 /*
1529 * Compute difference between actual and desired value.
1530 */
1531 if ((wdiff = (*b ^ want) & mask)) {
1532 /*
1533 * Different, mark where we are and return.
1534 */
1535 xfs_trans_brelse(tp, bp);
1536 i += XFS_RTLOBIT(wdiff);
1537 *rtblock = start + i - 1;
1538 return 0;
1539 } else
1540 i = len;
1541 }
1542 /*
1543 * No match, return that we scanned the whole area.
1544 */
1545 xfs_trans_brelse(tp, bp);
1546 *rtblock = start + i - 1;
1547 return 0;
1548}
1549
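Together, xfs_rtfind_back and xfs_rtfind_forw bracket the free (or allocated) extent containing a given block; the allocate and free paths use the resulting [preblock, postblock] pair to fix up the summary. A bit-at-a-time model of the pair (illustrative only; the kernel versions scan a word at a time with the masking shown above):

	#include <stdint.h>
	#include <stdio.h>

	static int get_bit(const uint32_t *bmp, uint64_t bit)
	{
		return (bmp[bit / 32] >> (bit % 32)) & 1;
	}

	/*
	 * From bit `start`, scan both ways for the first bit whose
	 * state differs, yielding the extent [pre, post] that
	 * contains start.
	 */
	static void find_extent(const uint32_t *bmp, uint64_t start,
			uint64_t limit, uint64_t *pre, uint64_t *post)
	{
		int state = get_bit(bmp, start);
		uint64_t b;

		for (b = start; b > 0 && get_bit(bmp, b - 1) == state; b--)
			;
		*pre = b;
		for (b = start; b < limit && get_bit(bmp, b + 1) == state; b++)
			;
		*post = b;
	}

	int main(void)
	{
		uint32_t bmp[1] = { 0x00000ff0 };	/* bits 4..11 free */
		uint64_t pre, post;

		find_extent(bmp, 7, 31, &pre, &post);
		printf("free extent around bit 7: [%llu, %llu]\n",
		       (unsigned long long)pre, (unsigned long long)post);
		return 0;
	}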
1550/*
1551 * Mark an extent specified by start and len freed.
1552 * Updates all the summary information as well as the bitmap.
1553 */
1554STATIC int /* error */
1555xfs_rtfree_range(
1556 xfs_mount_t *mp, /* file system mount point */
1557 xfs_trans_t *tp, /* transaction pointer */
1558 xfs_rtblock_t start, /* starting block to free */
1559 xfs_extlen_t len, /* length to free */
1560 xfs_buf_t **rbpp, /* in/out: summary block buffer */
1561 xfs_fsblock_t *rsb) /* in/out: summary block number */
1562{
1563 xfs_rtblock_t end; /* end of the freed extent */
1564 int error; /* error value */
1565 xfs_rtblock_t postblock; /* first block freed > end */
1566 xfs_rtblock_t preblock; /* first block freed < start */
1567
1568 end = start + len - 1;
1569 /*
1570 * Modify the bitmap to mark this extent freed.
1571 */
1572 error = xfs_rtmodify_range(mp, tp, start, len, 1);
1573 if (error) {
1574 return error;
1575 }
1576 /*
1577 * Assume we're freeing out of the middle of an allocated extent.
1578 * We need to find the beginning and end of the extent so we can
1579 * properly update the summary.
1580 */
1581 error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
1582 if (error) {
1583 return error;
1584 }
1585 /*
1586 * Find the next allocated block (end of allocated extent).
1587 */
1588	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
1589		&postblock);
	if (error) {
		return error;
	}
1590 /*
1591 * If there are blocks not being freed at the front of the
1592 * old extent, add summary data for them to be allocated.
1593 */
1594 if (preblock < start) {
1595 error = xfs_rtmodify_summary(mp, tp,
1596 XFS_RTBLOCKLOG(start - preblock),
1597 XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
1598 if (error) {
1599 return error;
1600 }
1601 }
1602 /*
1603 * If there are blocks not being freed at the end of the
1604 * old extent, add summary data for them to be allocated.
1605 */
1606 if (postblock > end) {
1607 error = xfs_rtmodify_summary(mp, tp,
1608 XFS_RTBLOCKLOG(postblock - end),
1609 XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
1610 if (error) {
1611 return error;
1612 }
1613 }
1614 /*
1615 * Increment the summary information corresponding to the entire
1616 * (new) free extent.
1617 */
1618 error = xfs_rtmodify_summary(mp, tp,
1619 XFS_RTBLOCKLOG(postblock + 1 - preblock),
1620 XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
1621 return error;
1622}
1623
1624/*
1625 * Read and return the summary information for a given extent size,
1626 * bitmap block combination.
1627 * Keeps track of a current summary block, so we don't keep reading
1628 * it from the buffer cache.
1629 */
1630STATIC int /* error */
1631xfs_rtget_summary(
1632 xfs_mount_t *mp, /* file system mount structure */
1633 xfs_trans_t *tp, /* transaction pointer */
1634 int log, /* log2 of extent size */
1635 xfs_rtblock_t bbno, /* bitmap block number */
1636 xfs_buf_t **rbpp, /* in/out: summary block buffer */
1637 xfs_fsblock_t *rsb, /* in/out: summary block number */
1638 xfs_suminfo_t *sum) /* out: summary info for this block */
1639{
1640 xfs_buf_t *bp; /* buffer for summary block */
1641 int error; /* error value */
1642 xfs_fsblock_t sb; /* summary fsblock */
1643 int so; /* index into the summary file */
1644 xfs_suminfo_t *sp; /* pointer to returned data */
1645
1646 /*
1647 * Compute entry number in the summary file.
1648 */
1649 so = XFS_SUMOFFS(mp, log, bbno);
1650 /*
1651 * Compute the block number in the summary file.
1652 */
1653 sb = XFS_SUMOFFSTOBLOCK(mp, so);
1654 /*
1655 * If we have an old buffer, and the block number matches, use that.
1656 */
1657 if (rbpp && *rbpp && *rsb == sb)
1658 bp = *rbpp;
1659 /*
1660 * Otherwise we have to get the buffer.
1661 */
1662 else {
1663 /*
1664 * If there was an old one, get rid of it first.
1665 */
1666 if (rbpp && *rbpp)
1667 xfs_trans_brelse(tp, *rbpp);
1668 error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
1669 if (error) {
1670 return error;
1671 }
1672 /*
1673 * Remember this buffer and block for the next call.
1674 */
1675 if (rbpp) {
1676 *rbpp = bp;
1677 *rsb = sb;
1678 }
1679 }
1680 /*
1681 * Point to the summary information & copy it out.
1682 */
1683 sp = XFS_SUMPTR(mp, bp, so);
1684 *sum = *sp;
1685 /*
1686 * Drop the buffer if we're not asked to remember it.
1687 */
1688 if (!rbpp)
1689 xfs_trans_brelse(tp, bp);
1690 return 0;
1691}
1692
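The summary-file addressing deserves a note: the summary is a two-dimensional array indexed by (log2 extent size, bitmap block), linearized by XFS_SUMOFFS and then mapped to a block and an offset within it. Because consecutive calls usually land in the same block, the rbpp/rsb pair caches the last buffer. A sketch of the index arithmetic with assumed geometry (4KB blocks, 32-bit xfs_suminfo_t):

	#include <stdint.h>
	#include <stdio.h>

	#define SUMINFO_SIZE	4u	/* sizeof(xfs_suminfo_t) */

	struct geom {
		unsigned int rbmblocks;	/* sb_rbmblocks */
		unsigned int blocklog;	/* sb_blocklog, log2(blocksize) */
	};

	/* XFS_SUMOFFS: linear entry number for (log, bbno) */
	static unsigned int sumoffs(const struct geom *g, int log,
			unsigned int bbno)
	{
		return log * g->rbmblocks + bbno;
	}

	/* XFS_SUMOFFSTOBLOCK: which summary-file block holds entry so */
	static unsigned int sumoffstoblock(const struct geom *g,
			unsigned int so)
	{
		return (so * SUMINFO_SIZE) >> g->blocklog;
	}

	int main(void)
	{
		struct geom g = { .rbmblocks = 100, .blocklog = 12 };
		unsigned int so = sumoffs(&g, 5, 42);
		unsigned int sb = sumoffstoblock(&g, so);

		/* entry 542 lives in block 0 at byte offset 2168 */
		printf("so=%u block=%u offset-in-block=%u\n", so, sb,
		       (so * SUMINFO_SIZE) & ((1u << g.blocklog) - 1));
		return 0;
	}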
1693/*
1694 * Set the given range of bitmap bits to the given value.
1695 * Do whatever I/O and logging is required.
1696 */
1697STATIC int /* error */
1698xfs_rtmodify_range(
1699 xfs_mount_t *mp, /* file system mount point */
1700 xfs_trans_t *tp, /* transaction pointer */
1701 xfs_rtblock_t start, /* starting block to modify */
1702 xfs_extlen_t len, /* length of extent to modify */
1703 int val) /* 1 for free, 0 for allocated */
1704{
1705 xfs_rtword_t *b; /* current word in buffer */
1706 int bit; /* bit number in the word */
1707 xfs_rtblock_t block; /* bitmap block number */
1708 xfs_buf_t *bp; /* buf for the block */
1709 xfs_rtword_t *bufp; /* starting word in buffer */
1710 int error; /* error value */
1711 xfs_rtword_t *first; /* first used word in the buffer */
1712 int i; /* current bit number rel. to start */
1713 int lastbit; /* last useful bit in word */
1714	xfs_rtword_t	mask;		/* mask of relevant bits for value */
1715 int word; /* word number in the buffer */
1716
1717 /*
1718 * Compute starting bitmap block number.
1719 */
1720 block = XFS_BITTOBLOCK(mp, start);
1721 /*
1722 * Read the bitmap block, and point to its data.
1723 */
1724 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1725 if (error) {
1726 return error;
1727 }
1728 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1729 /*
1730 * Compute the starting word's address, and starting bit.
1731 */
1732 word = XFS_BITTOWORD(mp, start);
1733 first = b = &bufp[word];
1734 bit = (int)(start & (XFS_NBWORD - 1));
1735 /*
1736 * 0 (allocated) => all zeroes; 1 (free) => all ones.
1737 */
1738 val = -val;
1739 /*
1740 * If not starting on a word boundary, deal with the first
1741 * (partial) word.
1742 */
1743 if (bit) {
1744 /*
1745 * Compute first bit not changed and mask of relevant bits.
1746 */
1747 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
1748 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
1749 /*
1750 * Set/clear the active bits.
1751 */
1752 if (val)
1753 *b |= mask;
1754 else
1755 *b &= ~mask;
1756 i = lastbit - bit;
1757 /*
1758 * Go on to the next block if that's where the next word is
1759 * and we need the next word.
1760 */
1761 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1762 /*
1763 * Log the changed part of this block.
1764 * Get the next one.
1765 */
1766 xfs_trans_log_buf(tp, bp,
1767 (uint)((char *)first - (char *)bufp),
1768 (uint)((char *)b - (char *)bufp));
1769 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1770 if (error) {
1771 return error;
1772 }
1773 first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1774 word = 0;
1775 } else {
1776 /*
1777 * Go on to the next word in the buffer
1778 */
1779 b++;
1780 }
1781 } else {
1782 /*
1783 * Starting on a word boundary, no partial word.
1784 */
1785 i = 0;
1786 }
1787 /*
1788 * Loop over whole words in buffers. When we use up one buffer
1789 * we move on to the next one.
1790 */
1791 while (len - i >= XFS_NBWORD) {
1792 /*
1793 * Set the word value correctly.
1794 */
1795 *b = val;
1796 i += XFS_NBWORD;
1797 /*
1798 * Go on to the next block if that's where the next word is
1799 * and we need the next word.
1800 */
1801 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1802 /*
1803 * Log the changed part of this block.
1804 * Get the next one.
1805 */
1806 xfs_trans_log_buf(tp, bp,
1807 (uint)((char *)first - (char *)bufp),
1808 (uint)((char *)b - (char *)bufp));
1809 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1810 if (error) {
1811 return error;
1812 }
1813 first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1814 word = 0;
1815 } else {
1816 /*
1817 * Go on to the next word in the buffer
1818 */
1819 b++;
1820 }
1821 }
1822 /*
1823 * If not ending on a word boundary, deal with the last
1824 * (partial) word.
1825 */
1826 if ((lastbit = len - i)) {
1827 /*
1828 * Compute a mask of relevant bits.
1829 */
1830 bit = 0;
1831 mask = ((xfs_rtword_t)1 << lastbit) - 1;
1832 /*
1833 * Set/clear the active bits.
1834 */
1835 if (val)
1836 *b |= mask;
1837 else
1838 *b &= ~mask;
1839 b++;
1840 }
1841 /*
1842 * Log any remaining changed bytes.
1843 */
1844 if (b > first)
1845 xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
1846 (uint)((char *)b - (char *)bufp - 1));
1847 return 0;
1848}
1849
1850/*
1851 * Read and modify the summary information for a given extent size,
1852 * bitmap block combination.
1853 * Keeps track of a current summary block, so we don't keep reading
1854 * it from the buffer cache.
1855 */
1856STATIC int /* error */
1857xfs_rtmodify_summary(
1858 xfs_mount_t *mp, /* file system mount point */
1859 xfs_trans_t *tp, /* transaction pointer */
1860 int log, /* log2 of extent size */
1861 xfs_rtblock_t bbno, /* bitmap block number */
1862 int delta, /* change to make to summary info */
1863 xfs_buf_t **rbpp, /* in/out: summary block buffer */
1864 xfs_fsblock_t *rsb) /* in/out: summary block number */
1865{
1866 xfs_buf_t *bp; /* buffer for the summary block */
1867 int error; /* error value */
1868 xfs_fsblock_t sb; /* summary fsblock */
1869 int so; /* index into the summary file */
1870 xfs_suminfo_t *sp; /* pointer to returned data */
1871
1872 /*
1873 * Compute entry number in the summary file.
1874 */
1875 so = XFS_SUMOFFS(mp, log, bbno);
1876 /*
1877 * Compute the block number in the summary file.
1878 */
1879 sb = XFS_SUMOFFSTOBLOCK(mp, so);
1880 /*
1881 * If we have an old buffer, and the block number matches, use that.
1882 */
1883 if (rbpp && *rbpp && *rsb == sb)
1884 bp = *rbpp;
1885 /*
1886 * Otherwise we have to get the buffer.
1887 */
1888 else {
1889 /*
1890 * If there was an old one, get rid of it first.
1891 */
1892 if (rbpp && *rbpp)
1893 xfs_trans_brelse(tp, *rbpp);
1894 error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
1895 if (error) {
1896 return error;
1897 }
1898 /*
1899 * Remember this buffer and block for the next call.
1900 */
1901 if (rbpp) {
1902 *rbpp = bp;
1903 *rsb = sb;
1904 }
1905 }
1906 /*
1907 * Point to the summary information, modify and log it.
1908 */
1909 sp = XFS_SUMPTR(mp, bp, so);
1910 *sp += delta;
1911 xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)),
1912 (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1));
1913 return 0;
1914}
1915
1916/*
1917 * Visible (exported) functions.
1918 */
1919
1920/*
1921 * Grow the realtime area of the filesystem.
1922 */
1923int
1924xfs_growfs_rt(
1925 xfs_mount_t *mp, /* mount point for filesystem */
1926 xfs_growfs_rt_t *in) /* growfs rt input struct */
1927{
1928 xfs_rtblock_t bmbno; /* bitmap block number */
1929 xfs_buf_t *bp; /* temporary buffer */
1930 int cancelflags; /* flags for xfs_trans_cancel */
1931 int error; /* error return value */
1932 xfs_inode_t *ip; /* bitmap inode, used as lock */
1933 xfs_mount_t *nmp; /* new (fake) mount structure */
1934 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1935 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
1936 xfs_drtbno_t nrextents; /* new number of realtime extents */
1937 uint8_t nrextslog; /* new log2 of sb_rextents */
1938 xfs_extlen_t nrsumblocks; /* new number of summary blocks */
1939 uint nrsumlevels; /* new rt summary levels */
1940 uint nrsumsize; /* new size of rt summary, bytes */
1941 xfs_sb_t *nsbp; /* new superblock */
1942 xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
1943 xfs_extlen_t rsumblocks; /* current number of rt summary blks */
1944 xfs_sb_t *sbp; /* old superblock */
1945 xfs_fsblock_t sumbno; /* summary block number */
1946 xfs_trans_t *tp; /* transaction pointer */
1947
1948 sbp = &mp->m_sb;
1949 /*
1950 * Initial error checking.
1951 */
1952	if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
1953 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
1954 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
1955 return XFS_ERROR(EINVAL);
1956 /*
1957 * Read in the last block of the device, make sure it exists.
1958 */
1959 error = xfs_read_buf(mp, mp->m_rtdev_targp,
1960 XFS_FSB_TO_BB(mp, in->newblocks - 1),
1961 XFS_FSB_TO_BB(mp, 1), 0, &bp);
1962 if (error)
1963 return error;
1964 ASSERT(bp);
1965 xfs_buf_relse(bp);
1966 /*
1967 * Calculate new parameters. These are the final values to be reached.
1968 */
1969 nrextents = nrblocks;
1970 do_div(nrextents, in->extsize);
1971	nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
1972 nrextslog = xfs_highbit32(nrextents);
1973 nrsumlevels = nrextslog + 1;
1974 nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
1975 nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
1976 nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
1977 /*
1978 * New summary size can't be more than half the size of
1979 * the log. This prevents us from getting a log overflow,
1980 * since we'll log basically the whole summary file at once.
1981 */
1982 if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
1983 return XFS_ERROR(EINVAL);
1984 /*
1985 * Get the old block counts for bitmap and summary inodes.
1986 * These can't change since other growfs callers are locked out.
1987 */
1988 rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size);
1989 rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size);
1990 /*
1991 * Allocate space to the bitmap and summary files, as necessary.
1992 */
1993 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks,
1994 mp->m_sb.sb_rbmino)))
1995 return error;
1996 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks,
1997 mp->m_sb.sb_rsumino)))
1998 return error;
1999 nmp = NULL;
2000 /*
2001 * Loop over the bitmap blocks.
2002 * We will do everything one bitmap block at a time.
2003 * Skip the current block if it is exactly full.
2004 * This also deals with the case where there were no rtextents before.
2005 */
2006 for (bmbno = sbp->sb_rbmblocks -
2007 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
2008 bmbno < nrbmblocks;
2009 bmbno++) {
2010 /*
2011 * Allocate a new (fake) mount/sb.
2012 */
2013 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
2014 *nmp = *mp;
2015 nsbp = &nmp->m_sb;
2016 /*
2017 * Calculate new sb and mount fields for this round.
2018 */
2019 nsbp->sb_rextsize = in->extsize;
2020 nsbp->sb_rbmblocks = bmbno + 1;
2021 nsbp->sb_rblocks =
2022 XFS_RTMIN(nrblocks,
2023 nsbp->sb_rbmblocks * NBBY *
2024 nsbp->sb_blocksize * nsbp->sb_rextsize);
2025 nsbp->sb_rextents = nsbp->sb_rblocks;
2026 do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
2027 nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
2028 nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
2029 nrsumsize =
2030 (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
2031 nsbp->sb_rbmblocks;
2032 nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
2033 nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
2034 /*
2035 * Start a transaction, get the log reservation.
2036 */
2037 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
2038 cancelflags = 0;
2039 if ((error = xfs_trans_reserve(tp, 0,
2040 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
2041 goto error_exit;
2042 /*
2043 * Lock out other callers by grabbing the bitmap inode lock.
2044 */
2045		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2046				XFS_ILOCK_EXCL, &ip)))
2047 goto error_exit;
2048 ASSERT(ip == mp->m_rbmip);
2049 /*
2050 * Update the bitmap inode's size.
2051 */
2052 mp->m_rbmip->i_d.di_size =
2053 nsbp->sb_rbmblocks * nsbp->sb_blocksize;
2054 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2055 cancelflags |= XFS_TRANS_ABORT;
2056 /*
2057 * Get the summary inode into the transaction.
2058 */
2059 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino,
2060 0, XFS_ILOCK_EXCL, &ip)))
2061 goto error_exit;
2062 ASSERT(ip == mp->m_rsumip);
2063 /*
2064 * Update the summary inode's size.
2065 */
2066 mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
2067 xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
2068 /*
2069 * Copy summary data from old to new sizes.
2070 * Do this when the real size (not block-aligned) changes.
2071 */
2072 if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
2073 mp->m_rsumlevels != nmp->m_rsumlevels) {
2074 error = xfs_rtcopy_summary(mp, nmp, tp);
2075 if (error)
2076 goto error_exit;
2077 }
2078 /*
2079 * Update superblock fields.
2080 */
2081 if (nsbp->sb_rextsize != sbp->sb_rextsize)
2082 xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
2083 nsbp->sb_rextsize - sbp->sb_rextsize);
2084 if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
2085 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
2086 nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
2087 if (nsbp->sb_rblocks != sbp->sb_rblocks)
2088 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
2089 nsbp->sb_rblocks - sbp->sb_rblocks);
2090 if (nsbp->sb_rextents != sbp->sb_rextents)
2091 xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
2092 nsbp->sb_rextents - sbp->sb_rextents);
2093 if (nsbp->sb_rextslog != sbp->sb_rextslog)
2094 xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
2095 nsbp->sb_rextslog - sbp->sb_rextslog);
2096 /*
2097 * Free new extent.
2098 */
2099 bp = NULL;
2100 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
2101 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
2102 if (error)
2103 goto error_exit;
2104 /*
2105 * Mark more blocks free in the superblock.
2106 */
2107 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
2108 nsbp->sb_rextents - sbp->sb_rextents);
2109 /*
2110 * Free the fake mp structure.
2111 */
2112 kmem_free(nmp, sizeof(*nmp));
2113 nmp = NULL;
2114 /*
2115 * Update mp values into the real mp structure.
2116 */
2117 mp->m_rsumlevels = nrsumlevels;
2118 mp->m_rsumsize = nrsumsize;
2119 /*
2120 * Commit the transaction.
2121 */
2122 xfs_trans_commit(tp, 0, NULL);
2123 }
2124 return 0;
2125
2126 /*
2127 * Error paths come here.
2128 */
2129error_exit:
2130 if (nmp)
2131 kmem_free(nmp, sizeof(*nmp));
2132 xfs_trans_cancel(tp, cancelflags);
2133 return error;
2134}
2135
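The geometry computed at the top of xfs_growfs_rt follows directly from the superblock fields: one bitmap bit per realtime extent, and one summary level per bit of sb_rextslog plus one. The sketch below redoes the arithmetic in user space; howmany_64 and xfs_highbit32 are reimplemented locally, and the input values are assumptions chosen for illustration, not defaults.

	#include <stdint.h>
	#include <stdio.h>

	/* index of highest set bit, -1 if none; like xfs_highbit32 */
	static int highbit32(uint32_t v)
	{
		int r = -1;

		while (v) {
			v >>= 1;
			r++;
		}
		return r;
	}

	int main(void)
	{
		uint64_t nrblocks  = 1u << 20;	/* new rt size, fs blocks */
		uint32_t extsize   = 16;	/* rt extent size, fs blocks */
		uint32_t blocksize = 4096;
		uint64_t nrextents = nrblocks / extsize;
		/* one bitmap bit per extent, NBBY*blocksize bits/block */
		uint64_t bits_per_blk = 8ull * blocksize;
		uint64_t nrbmblocks = (nrextents + bits_per_blk - 1) /
				      bits_per_blk;	/* howmany_64 */
		int      nrextslog  = highbit32((uint32_t)nrextents);
		unsigned nrsumlevels = nrextslog + 1;
		uint64_t nrsumsize  = 4ull * nrsumlevels * nrbmblocks;
		uint64_t nrsumblocks = (nrsumsize + blocksize - 1) /
				       blocksize;

		printf("extents=%llu bmblocks=%llu levels=%u sumblocks=%llu\n",
		       (unsigned long long)nrextents,
		       (unsigned long long)nrbmblocks, nrsumlevels,
		       (unsigned long long)nrsumblocks);
		return 0;
	}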
2136/*
2137 * Allocate an extent in the realtime subvolume, with the usual allocation
2138 * parameters. The length units are all in realtime extents, as is the
2139 * result block number.
2140 */
2141int /* error */
2142xfs_rtallocate_extent(
2143 xfs_trans_t *tp, /* transaction pointer */
2144 xfs_rtblock_t bno, /* starting block number to allocate */
2145 xfs_extlen_t minlen, /* minimum length to allocate */
2146 xfs_extlen_t maxlen, /* maximum length to allocate */
2147 xfs_extlen_t *len, /* out: actual length allocated */
2148 xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
2149 int wasdel, /* was a delayed allocation extent */
2150 xfs_extlen_t prod, /* extent product factor */
2151 xfs_rtblock_t *rtblock) /* out: start block allocated */
2152{
2153 int error; /* error value */
2154 xfs_inode_t *ip; /* inode for bitmap file */
2155 xfs_mount_t *mp; /* file system mount structure */
2156 xfs_rtblock_t r; /* result allocated block */
2157 xfs_fsblock_t sb; /* summary file block number */
2158 xfs_buf_t *sumbp; /* summary file block buffer */
2159
2160 ASSERT(minlen > 0 && minlen <= maxlen);
2161 mp = tp->t_mountp;
2162 /*
2163 * If prod is set then figure out what to do to minlen and maxlen.
2164 */
2165 if (prod > 1) {
2166 xfs_extlen_t i;
2167
2168 if ((i = maxlen % prod))
2169 maxlen -= i;
2170 if ((i = minlen % prod))
2171 minlen += prod - i;
2172 if (maxlen < minlen) {
2173 *rtblock = NULLRTBLOCK;
2174 return 0;
2175 }
2176 }
2177 /*
2178 * Lock out other callers by grabbing the bitmap inode lock.
2179 */
2180 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
2181 if (error) {
2182 return error;
2183 }
2184 sumbp = NULL;
2185 /*
2186 * Allocate by size, or near another block, or exactly at some block.
2187 */
2188 switch (type) {
2189 case XFS_ALLOCTYPE_ANY_AG:
2190 error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
2191 &sumbp, &sb, prod, &r);
2192 break;
2193 case XFS_ALLOCTYPE_NEAR_BNO:
2194 error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
2195 len, &sumbp, &sb, prod, &r);
2196 break;
2197 case XFS_ALLOCTYPE_THIS_BNO:
2198 error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
2199 len, &sumbp, &sb, prod, &r);
2200 break;
2201 default:
2202 ASSERT(0);
2203 }
2204 if (error) {
2205 return error;
2206 }
2207 /*
2208 * If it worked, update the superblock.
2209 */
2210 if (r != NULLRTBLOCK) {
2211 long slen = (long)*len;
2212
2213 ASSERT(*len >= minlen && *len <= maxlen);
2214 if (wasdel)
2215 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
2216 else
2217 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
2218 }
2219 *rtblock = r;
2220 return 0;
2221}
2222
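The prod handling at the top of the function rounds maxlen down and minlen up to multiples of the extent product factor, and returns NULLRTBLOCK if the two cross. The same rounding in isolation, with illustrative values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int prod = 8, minlen = 10, maxlen = 29, i;

		if ((i = maxlen % prod))
			maxlen -= i;		/* 29 -> 24 */
		if ((i = minlen % prod))
			minlen += prod - i;	/* 10 -> 16 */
		if (maxlen < minlen)
			printf("no aligned allocation possible\n");
		else
			printf("minlen=%u maxlen=%u\n", minlen, maxlen);
		return 0;
	}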
2223/*
2224 * Free an extent in the realtime subvolume. Length is expressed in
2225 * realtime extents, as is the block number.
2226 */
2227int /* error */
2228xfs_rtfree_extent(
2229 xfs_trans_t *tp, /* transaction pointer */
2230 xfs_rtblock_t bno, /* starting block number to free */
2231 xfs_extlen_t len) /* length of extent freed */
2232{
2233 int error; /* error value */
2234 xfs_inode_t *ip; /* bitmap file inode */
2235 xfs_mount_t *mp; /* file system mount structure */
2236 xfs_fsblock_t sb; /* summary file block number */
2237 xfs_buf_t *sumbp; /* summary file block buffer */
2238
2239 mp = tp->t_mountp;
2240 /*
2241 * Synchronize by locking the bitmap inode.
2242 */
2243 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
2244 if (error) {
2245 return error;
2246 }
2247#if defined(__KERNEL__) && defined(DEBUG)
2248 /*
2249 * Check to see that this whole range is currently allocated.
2250 */
2251 {
2252 int stat; /* result from checking range */
2253
2254 error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
2255 if (error) {
2256 return error;
2257 }
2258 ASSERT(stat);
2259 }
2260#endif
2261 sumbp = NULL;
2262 /*
2263 * Free the range of realtime blocks.
2264 */
2265 error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
2266 if (error) {
2267 return error;
2268 }
2269 /*
2270 * Mark more blocks free in the superblock.
2271 */
2272 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
2273 /*
2274 * If we've now freed all the blocks, reset the file sequence
2275 * number to 0.
2276 */
2277 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2278 mp->m_sb.sb_rextents) {
2279 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2280 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2281 *(__uint64_t *)&ip->i_d.di_atime = 0;
2282 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2283 }
2284 return 0;
2285}
2286
2287/*
2288 * Initialize realtime fields in the mount structure.
2289 */
2290int /* error */
2291xfs_rtmount_init(
2292 xfs_mount_t *mp) /* file system mount structure */
2293{
2294 xfs_buf_t *bp; /* buffer for last block of subvolume */
2295 xfs_daddr_t d; /* address of last block of subvolume */
2296 int error; /* error return value */
2297 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2298
2299 sbp = &mp->m_sb;
2300 if (sbp->sb_rblocks == 0)
2301 return 0;
2302 if (mp->m_rtdev_targp == NULL) {
2303 cmn_err(CE_WARN,
2304 "XFS: This filesystem has a realtime volume, use rtdev=device option");
2305 return XFS_ERROR(ENODEV);
2306 }
2307 mp->m_rsumlevels = sbp->sb_rextslog + 1;
2308 mp->m_rsumsize =
2309 (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
2310 sbp->sb_rbmblocks;
2311 mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
2312 mp->m_rbmip = mp->m_rsumip = NULL;
2313 /*
2314 * Check that the realtime section is an ok size.
2315 */
2316 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2317 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2318 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2319 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2320 (unsigned long long) mp->m_sb.sb_rblocks);
2321 return XFS_ERROR(E2BIG);
2322 }
2323 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2324 d - XFS_FSB_TO_BB(mp, 1),
2325 XFS_FSB_TO_BB(mp, 1), 0, &bp);
2326 if (error) {
2327 cmn_err(CE_WARN,
2328 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2329 if (error == ENOSPC)
2330 return XFS_ERROR(E2BIG);
2331 return error;
2332 }
2333 xfs_buf_relse(bp);
2334 return 0;
2335}
2336
2337/*
2338 * Get the bitmap and summary inodes into the mount structure
2339 * at mount time.
2340 */
2341int /* error */
2342xfs_rtmount_inodes(
2343 xfs_mount_t *mp) /* file system mount structure */
2344{
2345 int error; /* error return value */
2346 xfs_sb_t *sbp;
2347
2348 sbp = &mp->m_sb;
2349 if (sbp->sb_rbmino == NULLFSINO)
2350 return 0;
2351 error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0);
2352 if (error)
2353 return error;
2354 ASSERT(mp->m_rbmip != NULL);
2355 ASSERT(sbp->sb_rsumino != NULLFSINO);
2356 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
2357 if (error) {
2358 VN_RELE(XFS_ITOV(mp->m_rbmip));
2359 return error;
2360 }
2361 ASSERT(mp->m_rsumip != NULL);
2362 return 0;
2363}
2364
2365/*
2366 * Pick an extent for allocation at the start of a new realtime file.
2367 * Use the sequence number stored in the atime field of the bitmap inode.
2368 * Translate this to a fraction of the rtextents, and return the product
2369 * of rtextents and the fraction.
2370 * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
2371 */
2372int /* error */
2373xfs_rtpick_extent(
2374 xfs_mount_t *mp, /* file system mount point */
2375 xfs_trans_t *tp, /* transaction pointer */
2376 xfs_extlen_t len, /* allocation length (rtextents) */
2377 xfs_rtblock_t *pick) /* result rt extent */
2378{
2379 xfs_rtblock_t b; /* result block */
2380 int error; /* error return value */
2381 xfs_inode_t *ip; /* bitmap incore inode */
2382 int log2; /* log of sequence number */
2383 __uint64_t resid; /* residual after log removed */
2384 __uint64_t seq; /* sequence number of file creation */
2385 __uint64_t *seqp; /* pointer to seqno in inode */
2386
2387 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
2388 if (error)
2389 return error;
2390 ASSERT(ip == mp->m_rbmip);
2391 seqp = (__uint64_t *)&ip->i_d.di_atime;
2392 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2393 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2394 *seqp = 0;
2395 }
2396 seq = *seqp;
2397 if ((log2 = xfs_highbit64(seq)) == -1)
2398 b = 0;
2399 else {
2400 resid = seq - (1ULL << log2);
2401 b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
2402 (log2 + 1);
2403 if (b >= mp->m_sb.sb_rextents)
2404 b = do_mod(b, mp->m_sb.sb_rextents);
2405 if (b + len > mp->m_sb.sb_rextents)
2406 b = mp->m_sb.sb_rextents - len;
2407 }
2408 *seqp = seq + 1;
2409 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2410 *pick = b;
2411 return 0;
2412}
2413
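The fraction sequence is generated from the sequence number alone: with log2 = floor(log2(seq)) and resid = seq - 2^log2, the pick is rextents * (2*resid + 1) / 2^(log2+1). A short sketch printing the first few picks (the rextents value is an assumption for illustration):

	#include <stdint.h>
	#include <stdio.h>

	/* like xfs_highbit64: index of highest set bit, -1 if none */
	static int highbit64(uint64_t v)
	{
		int r = -1;

		while (v) {
			v >>= 1;
			r++;
		}
		return r;
	}

	int main(void)
	{
		uint64_t rextents = 1024;	/* assumed rt extent count */
		uint64_t seq;

		for (seq = 0; seq < 8; seq++) {
			int log2 = highbit64(seq);
			uint64_t b;

			if (log2 == -1)
				b = 0;
			else {
				uint64_t resid = seq - (1ULL << log2);

				b = (rextents * ((resid << 1) + 1ULL)) >>
				    (log2 + 1);
			}
			printf("seq %llu -> extent %llu\n",
			       (unsigned long long)seq,
			       (unsigned long long)b);
		}
		return 0;
	}

Running it gives extents 0, 512, 256, 768, 128, 384, 640, 896, i.e. the fractions 0, 1/2, 1/4, 3/4, 1/8, 3/8, 5/8, 7/8 of the assumed 1024 extents.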
2414#ifdef DEBUG
2415/*
2416 * Debug code: print out the value of a range in the bitmap.
2417 */
2418void
2419xfs_rtprint_range(
2420 xfs_mount_t *mp, /* file system mount structure */
2421 xfs_trans_t *tp, /* transaction pointer */
2422 xfs_rtblock_t start, /* starting block to print */
2423 xfs_extlen_t len) /* length to print */
2424{
2425 xfs_extlen_t i; /* block number in the extent */
2426
2427 printk("%Ld: ", (long long)start);
2428 for (i = 0; i < len; i++)
2429 printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
2430 printk("\n");
2431}
2432
2433/*
2434 * Debug code: print the summary file.
2435 */
2436void
2437xfs_rtprint_summary(
2438 xfs_mount_t *mp, /* file system mount structure */
2439 xfs_trans_t *tp) /* transaction pointer */
2440{
2441 xfs_suminfo_t c; /* summary data */
2442 xfs_rtblock_t i; /* bitmap block number */
2443 int l; /* summary information level */
2444 int p; /* flag for printed anything */
2445 xfs_fsblock_t sb; /* summary block number */
2446 xfs_buf_t *sumbp; /* summary block buffer */
2447
2448 sumbp = NULL;
2449 for (l = 0; l < mp->m_rsumlevels; l++) {
2450 for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
2451 (void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c);
2452 if (c) {
2453 if (!p) {
2454 printk("%Ld-%Ld:", 1LL << l,
2455 XFS_RTMIN((1LL << l) +
2456 ((1LL << l) - 1LL),
2457 mp->m_sb.sb_rextents));
2458 p = 1;
2459 }
2460 printk(" %Ld:%d", (long long)i, c);
2461 }
2462 }
2463 if (p)
2464 printk("\n");
2465 }
2466 if (sumbp)
2467 xfs_trans_brelse(tp, sumbp);
2468}
2469#endif /* DEBUG */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
new file mode 100644
index 000000000000..e2710264c054
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.h
@@ -0,0 +1,187 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_RTALLOC_H__
33#define __XFS_RTALLOC_H__
34
35struct xfs_mount;
36struct xfs_trans;
37
38#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
39
40/* Min and max rt extent sizes, specified in bytes */
41#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
42#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */
43#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */
44
45/*
46 * Constants for bit manipulations.
47 */
48#define XFS_NBBYLOG 3 /* log2(NBBY) */
49#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
50#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
51#define XFS_NBWORD (1 << XFS_NBWORDLOG)
52#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
53
54#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
55#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
56#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
57#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
58
59/*
60 * Summary and bit manipulation macros.
61 */
62#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
63#define XFS_SUMOFFSTOBLOCK(mp,s) \
64 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
65#define XFS_SUMPTR(mp,bp,so) \
66 ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \
67 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
68
69#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
70#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
71#define XFS_BITTOWORD(mp,bi) \
72 ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
73
74#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
75#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
76
77#define XFS_RTLOBIT(w) xfs_lowbit32(w)
78#define XFS_RTHIBIT(w) xfs_highbit32(w)
79
80#if XFS_BIG_BLKNOS
81#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
82#else
83#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
84#endif
85
86
87#ifdef __KERNEL__
88
89#ifdef CONFIG_XFS_RT
90/*
91 * Function prototypes for exported functions.
92 */
93
94/*
95 * Allocate an extent in the realtime subvolume, with the usual allocation
96 * parameters. The length units are all in realtime extents, as is the
97 * result block number.
98 */
99int /* error */
100xfs_rtallocate_extent(
101 struct xfs_trans *tp, /* transaction pointer */
102 xfs_rtblock_t bno, /* starting block number to allocate */
103 xfs_extlen_t minlen, /* minimum length to allocate */
104 xfs_extlen_t maxlen, /* maximum length to allocate */
105 xfs_extlen_t *len, /* out: actual length allocated */
106 xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
107 int wasdel, /* was a delayed allocation extent */
108 xfs_extlen_t prod, /* extent product factor */
109 xfs_rtblock_t *rtblock); /* out: start block allocated */
110
111/*
112 * Free an extent in the realtime subvolume. Length is expressed in
113 * realtime extents, as is the block number.
114 */
115int /* error */
116xfs_rtfree_extent(
117 struct xfs_trans *tp, /* transaction pointer */
118 xfs_rtblock_t bno, /* starting block number to free */
119 xfs_extlen_t len); /* length of extent freed */
120
121/*
122 * Initialize realtime fields in the mount structure.
123 */
124int /* error */
125xfs_rtmount_init(
126 struct xfs_mount *mp); /* file system mount structure */
127
128/*
129 * Get the bitmap and summary inodes into the mount structure
130 * at mount time.
131 */
132int /* error */
133xfs_rtmount_inodes(
134 struct xfs_mount *mp); /* file system mount structure */
135
136/*
137 * Pick an extent for allocation at the start of a new realtime file.
138 * Use the sequence number stored in the atime field of the bitmap inode.
139 * Translate this to a fraction of the rtextents, and return the product
140 * of rtextents and the fraction.
141 * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
142 */
143int /* error */
144xfs_rtpick_extent(
145 struct xfs_mount *mp, /* file system mount point */
146 struct xfs_trans *tp, /* transaction pointer */
147 xfs_extlen_t len, /* allocation length (rtextents) */
148 xfs_rtblock_t *pick); /* result rt extent */
149
150/*
151 * Debug code: print out the value of a range in the bitmap.
152 */
153void
154xfs_rtprint_range(
155 struct xfs_mount *mp, /* file system mount structure */
156 struct xfs_trans *tp, /* transaction pointer */
157 xfs_rtblock_t start, /* starting block to print */
158 xfs_extlen_t len); /* length to print */
159
160/*
161 * Debug code: print the summary file.
162 */
163void
164xfs_rtprint_summary(
165 struct xfs_mount *mp, /* file system mount structure */
166 struct xfs_trans *tp); /* transaction pointer */
167
168/*
169 * Grow the realtime area of the filesystem.
170 */
171int
172xfs_growfs_rt(
173 struct xfs_mount *mp, /* file system mount structure */
174 xfs_growfs_rt_t *in); /* user supplied growfs struct */
175
176#else
177# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS)
178# define xfs_rtfree_extent(t,b,l) (ENOSYS)
179# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
180# define xfs_growfs_rt(mp,in) (ENOSYS)
181# define xfs_rtmount_init(m)	(((m)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
182# define xfs_rtmount_inodes(m)	(((m)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
183#endif /* CONFIG_XFS_RT */
184
185#endif /* __KERNEL__ */
186
187#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
new file mode 100644
index 000000000000..d3ff7aef33ba
--- /dev/null
+++ b/fs/xfs/xfs_rw.c
@@ -0,0 +1,356 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_itable.h"
49#include "xfs_btree.h"
50#include "xfs_alloc.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode_item.h"
58#include "xfs_inode.h"
59#include "xfs_bmap.h"
60#include "xfs_acl.h"
61#include "xfs_mac.h"
62#include "xfs_error.h"
63#include "xfs_buf_item.h"
64#include "xfs_rw.h"
65
66/*
67 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
68 * which clears the setuid and setgid bits when a file is written.
69 */
70int
71xfs_write_clear_setuid(
72 xfs_inode_t *ip)
73{
74 xfs_mount_t *mp;
75 xfs_trans_t *tp;
76 int error;
77
78 mp = ip->i_mount;
79 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
80 if ((error = xfs_trans_reserve(tp, 0,
81 XFS_WRITEID_LOG_RES(mp),
82 0, 0, 0))) {
83 xfs_trans_cancel(tp, 0);
84 return error;
85 }
86 xfs_ilock(ip, XFS_ILOCK_EXCL);
87 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
88 xfs_trans_ihold(tp, ip);
89 ip->i_d.di_mode &= ~S_ISUID;
90
91 /*
92 * Note that we don't have to worry about mandatory
93 * file locking being disabled here because we only
94 * clear the S_ISGID bit if the Group execute bit is
95 * on, but if it was on then mandatory locking wouldn't
96 * have been enabled.
97 */
98 if (ip->i_d.di_mode & S_IXGRP) {
99 ip->i_d.di_mode &= ~S_ISGID;
100 }
101 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
102 xfs_trans_set_sync(tp);
103 error = xfs_trans_commit(tp, 0, NULL);
104 xfs_iunlock(ip, XFS_ILOCK_EXCL);
105 return error;
106}
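
/*
 * A minimal usage sketch for the helper above, not part of the
 * original file: a write path would test the mode bits first so the
 * synchronous transaction is only paid for when a bit is actually set.
 * The caller name and surrounding logic are illustrative assumptions.
 */
STATIC int
example_clear_setuid_on_write(
 xfs_inode_t *ip)
{
 if (ip->i_d.di_mode & (S_ISUID | S_ISGID))
  return xfs_write_clear_setuid(ip);
 return 0;
}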
107
108/*
109 * Force a shutdown of the filesystem instantly while keeping
110 * the filesystem consistent. We don't do an unmount here; just shutdown
111 * the shop, make sure that absolutely nothing persistent happens to
112 * this filesystem after this point.
113 */
114
115void
116xfs_do_force_shutdown(
117 bhv_desc_t *bdp,
118 int flags,
119 char *fname,
120 int lnnum)
121{
122 int logerror;
123 xfs_mount_t *mp;
124
125 mp = XFS_BHVTOM(bdp);
126 logerror = flags & XFS_LOG_IO_ERROR;
127
128 if (!(flags & XFS_FORCE_UMOUNT)) {
129 cmn_err(CE_NOTE,
130 "xfs_force_shutdown(%s,0x%x) called from line %d of file %s. Return address = 0x%p",
131 mp->m_fsname,flags,lnnum,fname,__return_address);
132 }
133 /*
134 * No need to duplicate efforts.
135 */
136 if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
137 return;
138
139 /*
140 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
141 * queue up anybody new on the log reservations, and wakes up
142 * everybody who's sleeping on log reservations and tells
143 * them the bad news.
144 */
145 if (xfs_log_force_umount(mp, logerror))
146 return;
147
148 if (flags & XFS_CORRUPT_INCORE) {
149 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
150 "Corruption of in-memory data detected. Shutting down filesystem: %s",
151 mp->m_fsname);
152 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
153 xfs_stack_trace();
154 }
155 } else if (!(flags & XFS_FORCE_UMOUNT)) {
156 if (logerror) {
157 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
158 "Log I/O Error Detected. Shutting down filesystem: %s",
159 mp->m_fsname);
160 } else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
161 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
162 "I/O Error Detected. Shutting down filesystem: %s",
163 mp->m_fsname);
164 }
165 }
166 if (!(flags & XFS_FORCE_UMOUNT)) {
167 cmn_err(CE_ALERT,
168 "Please umount the filesystem, and rectify the problem(s)");
169 }
170}
171
172
173/*
174 * Called when we want to stop a buffer from getting written or read.
175 * We attach the EIO error, muck with its flags, and call biodone
176 * so that the proper iodone callbacks get called.
177 */
178int
179xfs_bioerror(
180 xfs_buf_t *bp)
181{
182
183#ifdef XFSERRORDEBUG
184 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
185#endif
186
187 /*
188 * No need to wait until the buffer is unpinned.
189 * We aren't flushing it.
190 */
191 xfs_buftrace("XFS IOERROR", bp);
192 XFS_BUF_ERROR(bp, EIO);
193 /*
194 * We're calling biodone, so delete the B_DONE flag. Either way
195 * we have to call the iodone callback, and calling biodone
196 * probably is the best way since it takes care of
197 * GRIO as well.
198 */
199 XFS_BUF_UNREAD(bp);
200 XFS_BUF_UNDELAYWRITE(bp);
201 XFS_BUF_UNDONE(bp);
202 XFS_BUF_STALE(bp);
203
204 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
205 xfs_biodone(bp);
206
207 return (EIO);
208}
209
210/*
211 * Same as xfs_bioerror, except that we are releasing the buffer
212 * here ourselves, and avoiding the biodone call.
213 * This is meant for userdata errors; metadata bufs come with
214 * iodone functions attached, so that we can track down errors.
215 */
216int
217xfs_bioerror_relse(
218 xfs_buf_t *bp)
219{
220 int64_t fl;
221
222 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
223 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
224
225 xfs_buftrace("XFS IOERRELSE", bp);
226 fl = XFS_BUF_BFLAGS(bp);
227 /*
228 * No need to wait until the buffer is unpinned.
229 * We aren't flushing it.
230 *
231 * chunkhold expects B_DONE to be set, whether
232 * we actually finish the I/O or not. We don't want to
233 * change that interface.
234 */
235 XFS_BUF_UNREAD(bp);
236 XFS_BUF_UNDELAYWRITE(bp);
237 XFS_BUF_DONE(bp);
238 XFS_BUF_STALE(bp);
239 XFS_BUF_CLR_IODONE_FUNC(bp);
240 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
241 if (!(fl & XFS_B_ASYNC)) {
242 /*
243 * Mark b_error and B_ERROR _both_.
244 * Lots of chunkcache code assumes that.
245 * There's no reason to mark error for
246 * ASYNC buffers.
247 */
248 XFS_BUF_ERROR(bp, EIO);
249 XFS_BUF_V_IODONESEMA(bp);
250 } else {
251 xfs_buf_relse(bp);
252 }
253 return (EIO);
254}
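
/*
 * A hedged sketch of how a strategy routine might choose between the
 * two error paths above once the filesystem is shut down: buffers
 * with an iodone callback attached go through xfs_bioerror() so the
 * callback still runs, while plain user-data buffers are released
 * directly via xfs_bioerror_relse(). The function name and the exact
 * policy shown are illustrative assumptions, not from this file.
 */
STATIC int
example_shutdown_bdstrat(
 xfs_mount_t *mp,
 xfs_buf_t *bp)
{
 ASSERT(XFS_FORCED_SHUTDOWN(mp));
 if (XFS_BUF_IODONE_FUNC(bp))
  return xfs_bioerror(bp);
 return xfs_bioerror_relse(bp);
}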
255/*
256 * Prints out an ALERT message about I/O error.
257 */
258void
259xfs_ioerror_alert(
260 char *func,
261 struct xfs_mount *mp,
262 xfs_buf_t *bp,
263 xfs_daddr_t blkno)
264{
265 cmn_err(CE_ALERT,
266 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx"
267 " (\"%s\") error %d buf count %u",
268 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
269 XFS_BUFTARG_NAME(bp->pb_target),
270 (__uint64_t)blkno,
271 func,
272 XFS_BUF_GETERROR(bp),
273 XFS_BUF_COUNT(bp));
274}
275
276/*
277 * This isn't an absolute requirement, but it is
278 * just a good idea to call xfs_read_buf instead of
279 * directly doing a read_buf call. First, we shouldn't
280 * be doing this disk read if we are in SHUTDOWN state anyway,
281 * so this stops that from happening. Second, this does all
282 * the error checking stuff and the brelse if appropriate for
283 * the caller, so the code can be a little leaner.
284 */
285
286int
287xfs_read_buf(
288 struct xfs_mount *mp,
289 xfs_buftarg_t *target,
290 xfs_daddr_t blkno,
291 int len,
292 uint flags,
293 xfs_buf_t **bpp)
294{
295 xfs_buf_t *bp;
296 int error;
297
298 if (flags)
299 bp = xfs_buf_read_flags(target, blkno, len, flags);
300 else
301 bp = xfs_buf_read(target, blkno, len, flags);
302 if (!bp)
303 return XFS_ERROR(EIO);
304 error = XFS_BUF_GETERROR(bp);
305 if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
306 *bpp = bp;
307 } else {
308 *bpp = NULL;
309 if (error) {
310 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
311 } else {
312 error = XFS_ERROR(EIO);
313 }
314 if (bp) {
315 XFS_BUF_UNDONE(bp);
316 XFS_BUF_UNDELAYWRITE(bp);
317 XFS_BUF_STALE(bp);
318 /*
319 * brelse clears B_ERROR and b_error
320 */
321 xfs_buf_relse(bp);
322 }
323 }
324 return (error);
325}
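
/*
 * A minimal usage sketch for xfs_read_buf(): read one block and
 * release the buffer again. The block number and length here are
 * illustrative; on error xfs_read_buf() has already released the
 * buffer, so the caller only propagates the error.
 */
STATIC int
example_read_one_block(
 xfs_mount_t *mp,
 xfs_buftarg_t *target,
 xfs_daddr_t blkno,
 int len)
{
 xfs_buf_t *bp;
 int error;

 error = xfs_read_buf(mp, target, blkno, len, 0, &bp);
 if (error)
  return error;
 /* ... inspect the data via XFS_BUF_PTR(bp) here ... */
 xfs_buf_relse(bp);
 return 0;
}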
326
327/*
328 * Wrapper around bwrite() so that we can trap
329 * write errors, and act accordingly.
330 */
331int
332xfs_bwrite(
333 struct xfs_mount *mp,
334 struct xfs_buf *bp)
335{
336 int error;
337
338 /*
339 * XXXsup how does this work for quotas.
340 */
341 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
342 XFS_BUF_SET_FSPRIVATE3(bp, mp);
343 XFS_BUF_WRITE(bp);
344
345 if ((error = XFS_bwrite(bp))) {
346 ASSERT(mp);
347 /*
348 * Cannot put a buftrace here since if the buffer is not
349 * B_HOLD then we will brelse() the buffer before returning
350 * from bwrite and we could be tracing a buffer that has
351 * been reused.
352 */
353 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
354 }
355 return (error);
356}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
new file mode 100644
index 000000000000..c8b10bf8f530
--- /dev/null
+++ b/fs/xfs/xfs_rw.h
@@ -0,0 +1,154 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_RW_H__
33#define __XFS_RW_H__
34
35struct xfs_buf;
36struct xfs_inode;
37struct xfs_mount;
38
39/*
40 * Maximum count of bmaps used by read and write paths.
41 */
42#define XFS_MAX_RW_NBMAPS 4
43
44/*
45 * Counts of readahead buffers to use based on physical memory size.
46 * None of these should be more than XFS_MAX_RW_NBMAPS.
47 */
48#define XFS_RW_NREADAHEAD_16MB 2
49#define XFS_RW_NREADAHEAD_32MB 3
50#define XFS_RW_NREADAHEAD_K32 4
51#define XFS_RW_NREADAHEAD_K64 4
52
53/*
54 * Maximum size of a buffer that we'll map. Making this
55 * too big will degrade performance due to the number of
56 * pages which need to be gathered. Making it too small
57 * will prevent us from doing large I/O's to hardware that
58 * needs it.
59 *
60 * This is currently set to 512 KB.
61 */
62#define XFS_MAX_BMAP_LEN_BB 1024
63#define XFS_MAX_BMAP_LEN_BYTES 524288
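
/*
 * Worked arithmetic for the two limits above (a sketch of the
 * intended relationship, given 512-byte basic blocks):
 *
 * 1024 BB * 512 bytes/BB = 524288 bytes = 512 KB
 *
 * i.e. XFS_MAX_BMAP_LEN_BYTES == XFS_MAX_BMAP_LEN_BB << 9.
 */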
64
65/*
66 * Convert the given file system block to a disk block.
67 * We have to treat it differently based on whether the
68 * file is a real time file or not, because the bmap code
69 * does.
70 */
71#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DB)
72xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
73#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb)
74#else
75#define XFS_FSB_TO_DB(ip,fsb) \
76 (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) ? \
77 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
78 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)))
79#endif
80
81#define XFS_FSB_TO_DB_IO(io,fsb) \
82 (((io)->io_flags & XFS_IOCORE_RT) ? \
83 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
84 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
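
/*
 * An illustrative reading of the two macros above, offered as a
 * sketch: realtime file blocks are numbered linearly on the realtime
 * device, so only the shift to basic blocks (XFS_FSB_TO_BB) is
 * needed, while regular file blocks encode an allocation group and
 * must be decomposed by XFS_FSB_TO_DADDR first.
 */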
85
86/*
87 * Prototypes for functions in xfs_rw.c.
88 */
89
90int
91xfs_write_clear_setuid(
92 struct xfs_inode *ip);
93
94int
95xfs_bwrite(
96 struct xfs_mount *mp,
97 struct xfs_buf *bp);
98
99int
100xfs_bioerror(
101 struct xfs_buf *b);
102
103int
104xfs_bioerror_relse(
105 struct xfs_buf *b);
106
107int
108xfs_read_buf(
109 struct xfs_mount *mp,
110 xfs_buftarg_t *target,
111 xfs_daddr_t blkno,
112 int len,
113 uint flags,
114 struct xfs_buf **bpp);
115
116void
117xfs_ioerror_alert(
118 char *func,
119 struct xfs_mount *mp,
120 xfs_buf_t *bp,
121 xfs_daddr_t blkno);
122
123
124/*
125 * Prototypes for functions in xfs_vnodeops.c.
126 */
127
128int
129xfs_rwlock(
130 bhv_desc_t *bdp,
131 vrwlock_t write_lock);
132
133void
134xfs_rwunlock(
135 bhv_desc_t *bdp,
136 vrwlock_t write_lock);
137
138int
139xfs_change_file_space(
140 bhv_desc_t *bdp,
141 int cmd,
142 xfs_flock64_t *bf,
143 xfs_off_t offset,
144 cred_t *credp,
145 int flags);
146
147int
148xfs_set_dmattrs(
149 bhv_desc_t *bdp,
150 u_int evmask,
151 u_int16_t state,
152 cred_t *credp);
153
154#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
new file mode 100644
index 000000000000..ad090a834ced
--- /dev/null
+++ b/fs/xfs/xfs_sb.h
@@ -0,0 +1,583 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SB_H__
33#define __XFS_SB_H__
34
35/*
36 * Super block
37 * Fits into a sector-sized buffer at address 0 of each allocation group.
38 * Only the first of these is ever updated except during growfs.
39 */
40
41struct xfs_buf;
42struct xfs_mount;
43
44#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
45#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
46#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
47#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
48#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
49#define XFS_SB_VERSION_NUMBITS 0x000f
50#define XFS_SB_VERSION_ALLFBITS 0xfff0
51#define XFS_SB_VERSION_SASHFBITS 0xf000
52#define XFS_SB_VERSION_REALFBITS 0x0ff0
53#define XFS_SB_VERSION_ATTRBIT 0x0010
54#define XFS_SB_VERSION_NLINKBIT 0x0020
55#define XFS_SB_VERSION_QUOTABIT 0x0040
56#define XFS_SB_VERSION_ALIGNBIT 0x0080
57#define XFS_SB_VERSION_DALIGNBIT 0x0100
58#define XFS_SB_VERSION_SHAREDBIT 0x0200
59#define XFS_SB_VERSION_LOGV2BIT 0x0400
60#define XFS_SB_VERSION_SECTORBIT 0x0800
61#define XFS_SB_VERSION_EXTFLGBIT 0x1000
62#define XFS_SB_VERSION_DIRV2BIT 0x2000
63#define XFS_SB_VERSION_MOREBITSBIT 0x8000
64#define XFS_SB_VERSION_OKSASHFBITS \
65 (XFS_SB_VERSION_EXTFLGBIT | \
66 XFS_SB_VERSION_DIRV2BIT)
67#define XFS_SB_VERSION_OKREALFBITS \
68 (XFS_SB_VERSION_ATTRBIT | \
69 XFS_SB_VERSION_NLINKBIT | \
70 XFS_SB_VERSION_QUOTABIT | \
71 XFS_SB_VERSION_ALIGNBIT | \
72 XFS_SB_VERSION_DALIGNBIT | \
73 XFS_SB_VERSION_SHAREDBIT | \
74 XFS_SB_VERSION_LOGV2BIT | \
75 XFS_SB_VERSION_SECTORBIT)
76#define XFS_SB_VERSION_OKSASHBITS \
77 (XFS_SB_VERSION_NUMBITS | \
78 XFS_SB_VERSION_REALFBITS | \
79 XFS_SB_VERSION_OKSASHFBITS)
80#define XFS_SB_VERSION_OKREALBITS \
81 (XFS_SB_VERSION_NUMBITS | \
82 XFS_SB_VERSION_OKREALFBITS | \
83 XFS_SB_VERSION_OKSASHFBITS)
84#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na,sflag,morebits) \
85 (((ia) || (dia) || (extflag) || (dirv2) || (na) || (sflag) || \
86 (morebits)) ? \
87 (XFS_SB_VERSION_4 | \
88 ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \
89 ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \
90 ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \
91 ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \
92 ((na) ? XFS_SB_VERSION_LOGV2BIT : 0) | \
93 ((sflag) ? XFS_SB_VERSION_SECTORBIT : 0) | \
94 ((morebits) ? XFS_SB_VERSION_MOREBITSBIT : 0)) : \
95 XFS_SB_VERSION_1)
96
97/*
98 * There are two words to hold XFS "feature" bits: the original
99 * word, sb_versionnum, and sb_features2. Whenever a bit is set in
100 * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
101 *
102 * These defines represent bits in sb_features2.
103 */
104#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */
105#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
106#define XFS_SB_VERSION2_SASHFBITS 0xff000000 /* Mask: features that
107 require changing
108 PROM and SASH */
109
110#define XFS_SB_VERSION2_OKREALFBITS \
111 (0)
112#define XFS_SB_VERSION2_OKSASHFBITS \
113 (0)
114#define XFS_SB_VERSION2_OKREALBITS \
115 (XFS_SB_VERSION2_OKREALFBITS | \
116 XFS_SB_VERSION2_OKSASHFBITS )
117
118/*
119 * mkfs macro to set up sb_features2 word
120 */
121#define XFS_SB_VERSION2_MKFS(xyz) \
122 ((xyz) ? 0 : 0)
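
/*
 * A hedged sketch of the rule above: a hypothetical sb_features2 bit
 * (XFS_SB_VERSION2_YBIT is not a real define) would only be honored
 * when XFS_SB_VERSION_MOREBITSBIT is also set, e.g.:
 *
 * #define XFS_SB_VERSION_HASY(sbp) \
 *  (XFS_SB_VERSION_HASMOREBITS(sbp) && \
 *   ((sbp)->sb_features2 & XFS_SB_VERSION2_YBIT))
 */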
123
124typedef struct xfs_sb
125{
126 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
127 __uint32_t sb_blocksize; /* logical block size, bytes */
128 xfs_drfsbno_t sb_dblocks; /* number of data blocks */
129 xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */
130 xfs_drtbno_t sb_rextents; /* number of realtime extents */
131 uuid_t sb_uuid; /* file system unique id */
132 xfs_dfsbno_t sb_logstart; /* starting block of log if internal */
133 xfs_ino_t sb_rootino; /* root inode number */
134 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
135 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
136 xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
137 xfs_agblock_t sb_agblocks; /* size of an allocation group */
138 xfs_agnumber_t sb_agcount; /* number of allocation groups */
139 xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
140 xfs_extlen_t sb_logblocks; /* number of log blocks */
141 __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
142 __uint16_t sb_sectsize; /* volume sector size, bytes */
143 __uint16_t sb_inodesize; /* inode size, bytes */
144 __uint16_t sb_inopblock; /* inodes per block */
145 char sb_fname[12]; /* file system name */
146 __uint8_t sb_blocklog; /* log2 of sb_blocksize */
147 __uint8_t sb_sectlog; /* log2 of sb_sectsize */
148 __uint8_t sb_inodelog; /* log2 of sb_inodesize */
149 __uint8_t sb_inopblog; /* log2 of sb_inopblock */
150 __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
151 __uint8_t sb_rextslog; /* log2 of sb_rextents */
152 __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
153 __uint8_t sb_imax_pct; /* max % of fs for inode space */
154 /* statistics */
155 /*
156 * These fields must remain contiguous. If you really
157 * want to change their layout, make sure you fix the
158 * code in xfs_trans_apply_sb_deltas().
159 */
160 __uint64_t sb_icount; /* allocated inodes */
161 __uint64_t sb_ifree; /* free inodes */
162 __uint64_t sb_fdblocks; /* free data blocks */
163 __uint64_t sb_frextents; /* free realtime extents */
164 /*
165 * End contiguous fields.
166 */
167 xfs_ino_t sb_uquotino; /* user quota inode */
168 xfs_ino_t sb_gquotino; /* group quota inode */
169 __uint16_t sb_qflags; /* quota flags */
170 __uint8_t sb_flags; /* misc. flags */
171 __uint8_t sb_shared_vn; /* shared version number */
172 xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
173 __uint32_t sb_unit; /* stripe or raid unit */
174 __uint32_t sb_width; /* stripe or raid width */
175 __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
176 __uint8_t sb_logsectlog; /* log2 of the log sector size */
177 __uint16_t sb_logsectsize; /* sector size for the log, bytes */
178 __uint32_t sb_logsunit; /* stripe unit size for the log */
179 __uint32_t sb_features2; /* additional feature bits */
180} xfs_sb_t;
181
182/*
183 * Sequence number values for the fields.
184 */
185typedef enum {
186 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
187 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
188 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
189 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
190 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
191 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
192 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
193 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
194 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
195 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
196 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
197 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
198 XFS_SBS_FEATURES2,
199 XFS_SBS_FIELDCOUNT
200} xfs_sb_field_t;
201
202/*
203 * Mask values, defined based on the xfs_sb_field_t values.
204 * Only define the ones we're using.
205 */
206#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
207#define XFS_SB_UUID XFS_SB_MVAL(UUID)
208#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
209#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
210#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
211#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
212#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
213#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
214#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
215#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
216#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
217#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
218#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
219#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
220#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
221#define XFS_SB_MOD_BITS \
222 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
223 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
224 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH)
225
226/*
227 * Misc. Flags - warning - these will be cleared by xfs_repair unless
228 * a feature bit is set when the flag is used.
229 */
230#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
231#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
232
233/*
234 * define max. shared version we can interoperate with
235 */
236#define XFS_SB_MAX_SHARED_VN 0
237
238#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_NUM)
239int xfs_sb_version_num(xfs_sb_t *sbp);
240#define XFS_SB_VERSION_NUM(sbp) xfs_sb_version_num(sbp)
241#else
242#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
243#endif
244
245#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_GOOD_VERSION)
246int xfs_sb_good_version(xfs_sb_t *sbp);
247#define XFS_SB_GOOD_VERSION(sbp) xfs_sb_good_version(sbp)
248#else
249#define XFS_SB_GOOD_VERSION_INT(sbp) \
250 ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
251 ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
252 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
253 !(((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \
254 (((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \
255 ((sbp)->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
256
257#ifdef __KERNEL__
258#define XFS_SB_GOOD_VERSION(sbp) \
259 (XFS_SB_GOOD_VERSION_INT(sbp) && \
260 (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN) ))
261#else
262/*
263 * extra 2 parens here (( to unconfuse paren-matching editors
264 * like vi because XFS_SB_GOOD_VERSION_INT is a partial expression
265 * and the two XFS_SB_GOOD_VERSION's each add 2 more close parens to
266 * complete the expression.
267 */
268#define XFS_SB_GOOD_VERSION(sbp) \
269 (XFS_SB_GOOD_VERSION_INT(sbp) && \
270 (!((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \
271 (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)) ))
272#endif /* __KERNEL__ */
273#endif
274
275#define XFS_SB_GOOD_SASH_VERSION(sbp) \
276 ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
277 ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
278 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
279 !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS)))
280
281#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TONEW)
282unsigned xfs_sb_version_tonew(unsigned v);
283#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v)
284#else
285#define XFS_SB_VERSION_TONEW(v) \
286 ((((v) == XFS_SB_VERSION_1) ? \
287 0 : \
288 (((v) == XFS_SB_VERSION_2) ? \
289 XFS_SB_VERSION_ATTRBIT : \
290 (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \
291 XFS_SB_VERSION_4)
292#endif
293
294#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TOOLD)
295unsigned xfs_sb_version_toold(unsigned v);
296#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v)
297#else
298#define XFS_SB_VERSION_TOOLD(v) \
299 (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \
300 0 : \
301 (((v) & XFS_SB_VERSION_NLINKBIT) ? \
302 XFS_SB_VERSION_3 : \
303 (((v) & XFS_SB_VERSION_ATTRBIT) ? \
304 XFS_SB_VERSION_2 : \
305 XFS_SB_VERSION_1)))
306#endif
307
308#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASATTR)
309int xfs_sb_version_hasattr(xfs_sb_t *sbp);
310#define XFS_SB_VERSION_HASATTR(sbp) xfs_sb_version_hasattr(sbp)
311#else
312#define XFS_SB_VERSION_HASATTR(sbp) \
313 (((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \
314 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
315 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
316 ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)))
317#endif
318
319#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDATTR)
320void xfs_sb_version_addattr(xfs_sb_t *sbp);
321#define XFS_SB_VERSION_ADDATTR(sbp) xfs_sb_version_addattr(sbp)
322#else
323#define XFS_SB_VERSION_ADDATTR(sbp) \
324 ((sbp)->sb_versionnum = \
325 (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \
326 XFS_SB_VERSION_2 : \
327 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \
328 ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \
329 (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))))
330#endif
331
332#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASNLINK)
333int xfs_sb_version_hasnlink(xfs_sb_t *sbp);
334#define XFS_SB_VERSION_HASNLINK(sbp) xfs_sb_version_hasnlink(sbp)
335#else
336#define XFS_SB_VERSION_HASNLINK(sbp) \
337 (((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
338 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
339 ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)))
340#endif
341
342#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDNLINK)
343void xfs_sb_version_addnlink(xfs_sb_t *sbp);
344#define XFS_SB_VERSION_ADDNLINK(sbp) xfs_sb_version_addnlink(sbp)
345#else
346#define XFS_SB_VERSION_ADDNLINK(sbp) \
347 ((sbp)->sb_versionnum = \
348 ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \
349 XFS_SB_VERSION_3 : \
350 ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)))
351#endif
352
353#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASQUOTA)
354int xfs_sb_version_hasquota(xfs_sb_t *sbp);
355#define XFS_SB_VERSION_HASQUOTA(sbp) xfs_sb_version_hasquota(sbp)
356#else
357#define XFS_SB_VERSION_HASQUOTA(sbp) \
358 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
359 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT))
360#endif
361
362#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDQUOTA)
363void xfs_sb_version_addquota(xfs_sb_t *sbp);
364#define XFS_SB_VERSION_ADDQUOTA(sbp) xfs_sb_version_addquota(sbp)
365#else
366#define XFS_SB_VERSION_ADDQUOTA(sbp) \
367 ((sbp)->sb_versionnum = \
368 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \
369 ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \
370 (XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \
371 XFS_SB_VERSION_QUOTABIT)))
372#endif
373
374#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASALIGN)
375int xfs_sb_version_hasalign(xfs_sb_t *sbp);
376#define XFS_SB_VERSION_HASALIGN(sbp) xfs_sb_version_hasalign(sbp)
377#else
378#define XFS_SB_VERSION_HASALIGN(sbp) \
379 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
380 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT))
381#endif
382
383#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBALIGN)
384void xfs_sb_version_subalign(xfs_sb_t *sbp);
385#define XFS_SB_VERSION_SUBALIGN(sbp) xfs_sb_version_subalign(sbp)
386#else
387#define XFS_SB_VERSION_SUBALIGN(sbp) \
388 ((sbp)->sb_versionnum = \
389 XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT))
390#endif
391
392#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDALIGN)
393int xfs_sb_version_hasdalign(xfs_sb_t *sbp);
394#define XFS_SB_VERSION_HASDALIGN(sbp) xfs_sb_version_hasdalign(sbp)
395#else
396#define XFS_SB_VERSION_HASDALIGN(sbp) \
397 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
398 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT))
399#endif
400
401#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDDALIGN)
402int xfs_sb_version_adddalign(xfs_sb_t *sbp);
403#define XFS_SB_VERSION_ADDDALIGN(sbp) xfs_sb_version_adddalign(sbp)
404#else
405#define XFS_SB_VERSION_ADDDALIGN(sbp) \
406 ((sbp)->sb_versionnum = \
407 ((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT))
408#endif
409
410#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSHARED)
411int xfs_sb_version_hasshared(xfs_sb_t *sbp);
412#define XFS_SB_VERSION_HASSHARED(sbp) xfs_sb_version_hasshared(sbp)
413#else
414#define XFS_SB_VERSION_HASSHARED(sbp) \
415 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
416 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT))
417#endif
418
419#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDSHARED)
420int xfs_sb_version_addshared(xfs_sb_t *sbp);
421#define XFS_SB_VERSION_ADDSHARED(sbp) xfs_sb_version_addshared(sbp)
422#else
423#define XFS_SB_VERSION_ADDSHARED(sbp) \
424 ((sbp)->sb_versionnum = \
425 ((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT))
426#endif
427
428#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBSHARED)
429int xfs_sb_version_subshared(xfs_sb_t *sbp);
430#define XFS_SB_VERSION_SUBSHARED(sbp) xfs_sb_version_subshared(sbp)
431#else
432#define XFS_SB_VERSION_SUBSHARED(sbp) \
433 ((sbp)->sb_versionnum = \
434 ((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT))
435#endif
436
437#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDIRV2)
438int xfs_sb_version_hasdirv2(xfs_sb_t *sbp);
439#define XFS_SB_VERSION_HASDIRV2(sbp) xfs_sb_version_hasdirv2(sbp)
440#else
441#define XFS_SB_VERSION_HASDIRV2(sbp) \
442 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
443 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
444#endif
445
446#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASLOGV2)
447int xfs_sb_version_haslogv2(xfs_sb_t *sbp);
448#define XFS_SB_VERSION_HASLOGV2(sbp) xfs_sb_version_haslogv2(sbp)
449#else
450#define XFS_SB_VERSION_HASLOGV2(sbp) \
451 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
452 ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT))
453#endif
454
455#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
456int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp);
457#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) xfs_sb_version_hasextflgbit(sbp)
458#else
459#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) \
460 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
461 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
462#endif
463
464#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDEXTFLGBIT)
465int xfs_sb_version_addextflgbit(xfs_sb_t *sbp);
466#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) xfs_sb_version_addextflgbit(sbp)
467#else
468#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) \
469 ((sbp)->sb_versionnum = \
470 ((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT))
471#endif
472
473#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBEXTFLGBIT)
474int xfs_sb_version_subextflgbit(xfs_sb_t *sbp);
475#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) xfs_sb_version_subextflgbit(sbp)
476#else
477#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) \
478 ((sbp)->sb_versionnum = \
479 ((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT))
480#endif
481
482#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSECTOR)
483int xfs_sb_version_hassector(xfs_sb_t *sbp);
484#define XFS_SB_VERSION_HASSECTOR(sbp) xfs_sb_version_hassector(sbp)
485#else
486#define XFS_SB_VERSION_HASSECTOR(sbp) \
487 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
488 ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT))
489#endif
490
491#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASMOREBITSBIT)
492int xfs_sb_version_hasmorebits(xfs_sb_t *sbp);
493#define XFS_SB_VERSION_HASMOREBITS(sbp) xfs_sb_version_hasmorebits(sbp)
494#else
495#define XFS_SB_VERSION_HASMOREBITS(sbp) \
496 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
497 ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT))
498#endif
499
500/*
501 * sb_features2 bit version macros.
502 *
503 * For example, a bit defined as XFS_SB_VERSION2_YBIT would have a macro:
504 *
505 * SB_VERSION_HASYBIT(xfs_sb_t *sbp)
506 * ((XFS_SB_VERSION_HASMOREBITS(sbp) &&
507 *  ((sbp)->sb_features2 & XFS_SB_VERSION2_YBIT))
508 */
509
510/*
511 * end of superblock version macros
512 */
513
514#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
515#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_BLOCK)
516xfs_agblock_t xfs_sb_block(struct xfs_mount *mp);
517#define XFS_SB_BLOCK(mp) xfs_sb_block(mp)
518#else
519#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
520#endif
521
522#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_HDR_BLOCK)
523xfs_agblock_t xfs_hdr_block(struct xfs_mount *mp, xfs_daddr_t d);
524#define XFS_HDR_BLOCK(mp,d) xfs_hdr_block(mp,d)
525#else
526#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp,d)))
527#endif
528#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_FSB)
529xfs_fsblock_t xfs_daddr_to_fsb(struct xfs_mount *mp, xfs_daddr_t d);
530#define XFS_DADDR_TO_FSB(mp,d) xfs_daddr_to_fsb(mp,d)
531#else
532#define XFS_DADDR_TO_FSB(mp,d) \
533 XFS_AGB_TO_FSB(mp, XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d))
534#endif
535#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DADDR)
536xfs_daddr_t xfs_fsb_to_daddr(struct xfs_mount *mp, xfs_fsblock_t fsbno);
537#define XFS_FSB_TO_DADDR(mp,fsbno) xfs_fsb_to_daddr(mp,fsbno)
538#else
539#define XFS_FSB_TO_DADDR(mp,fsbno) \
540 XFS_AGB_TO_DADDR(mp, XFS_FSB_TO_AGNO(mp,fsbno), \
541 XFS_FSB_TO_AGBNO(mp,fsbno))
542#endif
543
544#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBP)
545xfs_sb_t *xfs_buf_to_sbp(struct xfs_buf *bp);
546#define XFS_BUF_TO_SBP(bp) xfs_buf_to_sbp(bp)
547#else
548#define XFS_BUF_TO_SBP(bp) ((xfs_sb_t *)XFS_BUF_PTR(bp))
549#endif
550
551/*
552 * File system sector to basic block conversions.
553 */
554#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
555#define XFS_BB_TO_FSS(mp,bb) \
556 (((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log)
557#define XFS_BB_TO_FSST(mp,bb) ((bb) >> (mp)->m_sectbb_log)
558
559/*
560 * File system sector to byte conversions.
561 */
562#define XFS_FSS_TO_B(mp,sectno) ((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog)
563#define XFS_B_TO_FSST(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog)
564
565/*
566 * File system block to basic block conversions.
567 */
568#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
569#define XFS_BB_TO_FSB(mp,bb) \
570 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
571#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
572#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
573
574/*
575 * File system block to byte conversions.
576 */
577#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
578#define XFS_B_TO_FSB(mp,b) \
579 ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
580#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
581#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
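
/*
 * A worked example of the conversions above; the concrete numbers
 * assume a 4096-byte block size (sb_blocklog == 12, m_blkbb_log == 3,
 * since a basic block is 512 bytes):
 *
 * XFS_FSB_TO_BB(mp, 10) == 10 << 3 == 80 basic blocks
 * XFS_FSB_TO_B(mp, 10) == 10 << 12 == 40960 bytes
 * XFS_B_TO_FSB(mp, 4097) == 2 (rounds up)
 * XFS_B_TO_FSBT(mp, 4097) == 1 (truncates)
 */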
582
583#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
new file mode 100644
index 000000000000..3db0e2200775
--- /dev/null
+++ b/fs/xfs/xfs_trans.c
@@ -0,0 +1,1315 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_error.h"
46#include "xfs_trans_priv.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_alloc.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_da_btree.h"
60#include "xfs_quota.h"
61#include "xfs_trans_space.h"
62
63
64STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
65STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
66STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
67STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
68STATIC void xfs_trans_committed(xfs_trans_t *, int);
69STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
70STATIC void xfs_trans_free(xfs_trans_t *);
71
72kmem_zone_t *xfs_trans_zone;
73
74
75/*
76 * Initialize the precomputed transaction reservation values
77 * in the mount structure.
78 */
79void
80xfs_trans_init(
81 xfs_mount_t *mp)
82{
83 xfs_trans_reservations_t *resp;
84
85 resp = &(mp->m_reservations);
86 resp->tr_write =
87 (uint)(XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
88 resp->tr_itruncate =
89 (uint)(XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
90 resp->tr_rename =
91 (uint)(XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
92 resp->tr_link = (uint)XFS_CALC_LINK_LOG_RES(mp);
93 resp->tr_remove =
94 (uint)(XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
95 resp->tr_symlink =
96 (uint)(XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
97 resp->tr_create =
98 (uint)(XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
99 resp->tr_mkdir =
100 (uint)(XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
101 resp->tr_ifree =
102 (uint)(XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
103 resp->tr_ichange =
104 (uint)(XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
105 resp->tr_growdata = (uint)XFS_CALC_GROWDATA_LOG_RES(mp);
106 resp->tr_swrite = (uint)XFS_CALC_SWRITE_LOG_RES(mp);
107 resp->tr_writeid = (uint)XFS_CALC_WRITEID_LOG_RES(mp);
108 resp->tr_addafork =
109 (uint)(XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
110 resp->tr_attrinval = (uint)XFS_CALC_ATTRINVAL_LOG_RES(mp);
111 resp->tr_attrset =
112 (uint)(XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
113 resp->tr_attrrm =
114 (uint)(XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
115 resp->tr_clearagi = (uint)XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
116 resp->tr_growrtalloc = (uint)XFS_CALC_GROWRTALLOC_LOG_RES(mp);
117 resp->tr_growrtzero = (uint)XFS_CALC_GROWRTZERO_LOG_RES(mp);
118 resp->tr_growrtfree = (uint)XFS_CALC_GROWRTFREE_LOG_RES(mp);
119}
120
121/*
122 * This routine is called to allocate a transaction structure.
123 * The type parameter indicates the type of the transaction. These
124 * are enumerated in xfs_trans.h.
125 *
126 * Dynamically allocate the transaction structure from the transaction
127 * zone, initialize it, and return it to the caller.
128 */
129xfs_trans_t *
130xfs_trans_alloc(
131 xfs_mount_t *mp,
132 uint type)
133{
134 fs_check_frozen(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
135 atomic_inc(&mp->m_active_trans);
136
137 return (_xfs_trans_alloc(mp, type));
138
139}
140
141xfs_trans_t *
142_xfs_trans_alloc(
143 xfs_mount_t *mp,
144 uint type)
145{
146 xfs_trans_t *tp;
147
148 ASSERT(xfs_trans_zone != NULL);
149 tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
150
151 /*
152 * Initialize the transaction structure.
153 */
154 tp->t_magic = XFS_TRANS_MAGIC;
155 tp->t_type = type;
156 tp->t_mountp = mp;
157 tp->t_items_free = XFS_LIC_NUM_SLOTS;
158 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
159 XFS_LIC_INIT(&(tp->t_items));
160 XFS_LBC_INIT(&(tp->t_busy));
161
162 return (tp);
163}
164
165/*
166 * This is called to create a new transaction which will share the
167 * permanent log reservation of the given transaction. The remaining
168 * unused block and rt extent reservations are also inherited. This
169 * implies that the original transaction is no longer allowed to allocate
170 * blocks. Locks and log items, however, are not inherited. They must
171 * be added to the new transaction explicitly.
172 */
173xfs_trans_t *
174xfs_trans_dup(
175 xfs_trans_t *tp)
176{
177 xfs_trans_t *ntp;
178
179 ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
180
181 /*
182 * Initialize the new transaction structure.
183 */
184 ntp->t_magic = XFS_TRANS_MAGIC;
185 ntp->t_type = tp->t_type;
186 ntp->t_mountp = tp->t_mountp;
187 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
188 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
189 XFS_LIC_INIT(&(ntp->t_items));
190 XFS_LBC_INIT(&(ntp->t_busy));
191
192 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
193
194#if defined(XLOG_NOLOG) || defined(DEBUG)
195 ASSERT(!xlog_debug || tp->t_ticket != NULL);
196#else
197 ASSERT(tp->t_ticket != NULL);
198#endif
199 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
200 ntp->t_ticket = tp->t_ticket;
201 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
202 tp->t_blk_res = tp->t_blk_res_used;
203 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
204 tp->t_rtx_res = tp->t_rtx_res_used;
205 PFLAGS_DUP(&tp->t_pflags, &ntp->t_pflags);
206
207 XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
208
209 atomic_inc(&tp->t_mountp->m_active_trans);
210 return ntp;
211}
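
/*
 * A hedged sketch of the "rolling transaction" pattern that
 * xfs_trans_dup() enables: duplicate the transaction, commit the old
 * half, and carry on in the new one under the same permanent log
 * reservation. The helper name is illustrative; on error the caller
 * is still responsible for cancelling the new transaction.
 */
STATIC int
example_roll_trans(
 xfs_trans_t **tpp)
{
 xfs_trans_t *tp = *tpp;
 xfs_trans_t *ntp;
 int error;

 ntp = xfs_trans_dup(tp);
 error = xfs_trans_commit(tp, 0, NULL);
 *tpp = ntp;
 /* the caller would now re-reserve log space via xfs_trans_reserve() */
 return error;
}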
212
213/*
214 * This is called to reserve free disk blocks and log space for the
215 * given transaction. This must be done before allocating any resources
216 * within the transaction.
217 *
218 * This will return ENOSPC if there are not enough blocks available.
219 * It will sleep waiting for available log space.
220 * The only valid value for the flags parameter is XFS_TRANS_PERM_LOG_RES,
221 * which is used by long-running transactions. If any one of the reservations
222 * fails then they will all be backed out.
223 *
224 * This does not do quota reservations. That typically is done by the
225 * caller afterwards.
226 */
227int
228xfs_trans_reserve(
229 xfs_trans_t *tp,
230 uint blocks,
231 uint logspace,
232 uint rtextents,
233 uint flags,
234 uint logcount)
235{
236 int log_flags;
237 int error;
238 int rsvd;
239
240 error = 0;
241 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
242
243 /* Mark this thread as being in a transaction */
244 PFLAGS_SET_FSTRANS(&tp->t_pflags);
245
246 /*
247 * Attempt to reserve the needed disk blocks by decrementing
248 * the number needed from the number available. This will
249 * fail if the count would go below zero.
250 */
251 if (blocks > 0) {
252 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
253 -blocks, rsvd);
254 if (error != 0) {
255 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
256 return (XFS_ERROR(ENOSPC));
257 }
258 tp->t_blk_res += blocks;
259 }
260
261 /*
262 * Reserve the log space needed for this transaction.
263 */
264 if (logspace > 0) {
265 ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
266 ASSERT((tp->t_log_count == 0) ||
267 (tp->t_log_count == logcount));
268 if (flags & XFS_TRANS_PERM_LOG_RES) {
269 log_flags = XFS_LOG_PERM_RESERV;
270 tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
271 } else {
272 ASSERT(tp->t_ticket == NULL);
273 ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
274 log_flags = 0;
275 }
276
277 error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
278 &tp->t_ticket,
279 XFS_TRANSACTION, log_flags);
280 if (error) {
281 goto undo_blocks;
282 }
283 tp->t_log_res = logspace;
284 tp->t_log_count = logcount;
285 }
286
287 /*
288 * Attempt to reserve the needed realtime extents by decrementing
289 * the number needed from the number available. This will
290 * fail if the count would go below zero.
291 */
292 if (rtextents > 0) {
293 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
294 -rtextents, rsvd);
295 if (error) {
296 error = XFS_ERROR(ENOSPC);
297 goto undo_log;
298 }
299 tp->t_rtx_res += rtextents;
300 }
301
302 return 0;
303
304 /*
305 * Error cases jump to one of these labels to undo any
306 * reservations which have already been performed.
307 */
308undo_log:
309 if (logspace > 0) {
310 if (flags & XFS_TRANS_PERM_LOG_RES) {
311 log_flags = XFS_LOG_REL_PERM_RESERV;
312 } else {
313 log_flags = 0;
314 }
315 xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
316 tp->t_ticket = NULL;
317 tp->t_log_res = 0;
318 tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
319 }
320
321undo_blocks:
322 if (blocks > 0) {
323 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
324 blocks, rsvd);
325 tp->t_blk_res = 0;
326 }
327
328 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
329
330 return (error);
331}
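
/*
 * A minimal sketch of the allocate/reserve/commit lifecycle described
 * above; the transaction type and reservation figures are
 * illustrative. On failure xfs_trans_reserve() has already backed out
 * its own partial reservations, so the caller only needs to cancel
 * the transaction.
 */
STATIC int
example_simple_trans(
 xfs_mount_t *mp)
{
 xfs_trans_t *tp;
 int error;

 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
 error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp), 0, 0, 0);
 if (error) {
  xfs_trans_cancel(tp, 0);
  return error;
 }
 /* ... join, modify and log items here ... */
 return xfs_trans_commit(tp, 0, NULL);
}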
332
333
334/*
335 * This is called to set a callback to be called when the given
336 * transaction is committed to disk. The transaction pointer and the
337 * argument pointer will be passed to the callback routine.
338 *
339 * Only one callback can be associated with any single transaction.
340 */
341void
342xfs_trans_callback(
343 xfs_trans_t *tp,
344 xfs_trans_callback_t callback,
345 void *arg)
346{
347 ASSERT(tp->t_callback == NULL);
348 tp->t_callback = callback;
349 tp->t_callarg = arg;
350}
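
/*
 * A hedged usage sketch for xfs_trans_callback(); the callback body
 * and the registration site are hypothetical, not from this file.
 */
STATIC void
example_commit_done(
 xfs_trans_t *tp,
 void *arg)
{
 /* runs once the transaction has been committed to disk */
}
/* at setup time: xfs_trans_callback(tp, example_commit_done, arg); */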
351
352
353/*
354 * Record the indicated change to the given field for application
355 * to the file system's superblock when the transaction commits.
356 * For now, just store the change in the transaction structure.
357 *
358 * Mark the transaction structure to indicate that the superblock
359 * needs to be updated before committing.
360 */
361void
362xfs_trans_mod_sb(
363 xfs_trans_t *tp,
364 uint field,
365 long delta)
366{
367
368 switch (field) {
369 case XFS_TRANS_SB_ICOUNT:
370 tp->t_icount_delta += delta;
371 break;
372 case XFS_TRANS_SB_IFREE:
373 tp->t_ifree_delta += delta;
374 break;
375 case XFS_TRANS_SB_FDBLOCKS:
376 /*
377 * Track the number of blocks allocated in the
378 * transaction. Make sure it does not exceed the
379 * number reserved.
380 */
381 if (delta < 0) {
382 tp->t_blk_res_used += (uint)-delta;
383 ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
384 }
385 tp->t_fdblocks_delta += delta;
386 break;
387 case XFS_TRANS_SB_RES_FDBLOCKS:
388 /*
389 * The allocation has already been applied to the
390 * in-core superblock's counter. This should only
391 * be applied to the on-disk superblock.
392 */
393 ASSERT(delta < 0);
394 tp->t_res_fdblocks_delta += delta;
395 break;
396 case XFS_TRANS_SB_FREXTENTS:
397 /*
398 * Track the number of realtime extents allocated in the
399 * transaction. Make sure it does not exceed the
400 * number reserved.
401 */
402 if (delta < 0) {
403 tp->t_rtx_res_used += (uint)-delta;
404 ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
405 }
406 tp->t_frextents_delta += delta;
407 break;
408 case XFS_TRANS_SB_RES_FREXTENTS:
409 /*
410 * The allocation has already been applied to the
411 * in-core superblock's counter. This should only
412 * be applied to the on-disk superblock.
413 */
414 ASSERT(delta < 0);
415 tp->t_res_frextents_delta += delta;
416 break;
417 case XFS_TRANS_SB_DBLOCKS:
418 ASSERT(delta > 0);
419 tp->t_dblocks_delta += delta;
420 break;
421 case XFS_TRANS_SB_AGCOUNT:
422 ASSERT(delta > 0);
423 tp->t_agcount_delta += delta;
424 break;
425 case XFS_TRANS_SB_IMAXPCT:
426 tp->t_imaxpct_delta += delta;
427 break;
428 case XFS_TRANS_SB_REXTSIZE:
429 tp->t_rextsize_delta += delta;
430 break;
431 case XFS_TRANS_SB_RBMBLOCKS:
432 tp->t_rbmblocks_delta += delta;
433 break;
434 case XFS_TRANS_SB_RBLOCKS:
435 tp->t_rblocks_delta += delta;
436 break;
437 case XFS_TRANS_SB_REXTENTS:
438 tp->t_rextents_delta += delta;
439 break;
440 case XFS_TRANS_SB_REXTSLOG:
441 tp->t_rextslog_delta += delta;
442 break;
443 default:
444 ASSERT(0);
445 return;
446 }
447
448 tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY);
449}
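
/*
 * A minimal sketch of xfs_trans_mod_sb() in use (the helper and the
 * delta are illustrative): debiting the free-block counter for blocks
 * allocated inside the transaction, which must stay within the blocks
 * reserved earlier by xfs_trans_reserve().
 */
STATIC void
example_charge_blocks(
 xfs_trans_t *tp,
 long nblocks)
{
 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -nblocks);
}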
450
451/*
452 * xfs_trans_apply_sb_deltas() is called from the commit code
453 * to bring the superblock buffer into the current transaction
454 * and modify it as requested by earlier calls to xfs_trans_mod_sb().
455 *
456 * For now we just look at each field allowed to change and change
457 * it if necessary.
458 */
459STATIC void
460xfs_trans_apply_sb_deltas(
461 xfs_trans_t *tp)
462{
463 xfs_sb_t *sbp;
464 xfs_buf_t *bp;
465 int whole = 0;
466
467 bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
468 sbp = XFS_BUF_TO_SBP(bp);
469
470 /*
471 * Check that superblock mods match the mods made to AGF counters.
472 */
473 ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
474 (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
475 tp->t_ag_btree_delta));
476
477 if (tp->t_icount_delta != 0) {
478 INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta);
479 }
480 if (tp->t_ifree_delta != 0) {
481 INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta);
482 }
483
484 if (tp->t_fdblocks_delta != 0) {
485 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta);
486 }
487 if (tp->t_res_fdblocks_delta != 0) {
488 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta);
489 }
490
491 if (tp->t_frextents_delta != 0) {
492 INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_frextents_delta);
493 }
494 if (tp->t_res_frextents_delta != 0) {
495 INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_res_frextents_delta);
496 }
497 if (tp->t_dblocks_delta != 0) {
498 INT_MOD(sbp->sb_dblocks, ARCH_CONVERT, tp->t_dblocks_delta);
499 whole = 1;
500 }
501 if (tp->t_agcount_delta != 0) {
502 INT_MOD(sbp->sb_agcount, ARCH_CONVERT, tp->t_agcount_delta);
503 whole = 1;
504 }
505 if (tp->t_imaxpct_delta != 0) {
506 INT_MOD(sbp->sb_imax_pct, ARCH_CONVERT, tp->t_imaxpct_delta);
507 whole = 1;
508 }
509 if (tp->t_rextsize_delta != 0) {
510 INT_MOD(sbp->sb_rextsize, ARCH_CONVERT, tp->t_rextsize_delta);
511 whole = 1;
512 }
513 if (tp->t_rbmblocks_delta != 0) {
514 INT_MOD(sbp->sb_rbmblocks, ARCH_CONVERT, tp->t_rbmblocks_delta);
515 whole = 1;
516 }
517 if (tp->t_rblocks_delta != 0) {
518 INT_MOD(sbp->sb_rblocks, ARCH_CONVERT, tp->t_rblocks_delta);
519 whole = 1;
520 }
521 if (tp->t_rextents_delta != 0) {
522 INT_MOD(sbp->sb_rextents, ARCH_CONVERT, tp->t_rextents_delta);
523 whole = 1;
524 }
525 if (tp->t_rextslog_delta != 0) {
526 INT_MOD(sbp->sb_rextslog, ARCH_CONVERT, tp->t_rextslog_delta);
527 whole = 1;
528 }
529
530 if (whole)
531 /*
532 * Log the whole thing; the fields are discontiguous.
533 */
534 xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1);
535 else
536 /*
537 * Since all the modifiable fields are contiguous, we
538 * can get away with this.
539 */
540 xfs_trans_log_buf(tp, bp, offsetof(xfs_sb_t, sb_icount),
541 offsetof(xfs_sb_t, sb_frextents) +
542 sizeof(sbp->sb_frextents) - 1);
543
544 XFS_MTOVFS(tp->t_mountp)->vfs_super->s_dirt = 1;
545}
546
547/*
548 * xfs_trans_unreserve_and_mod_sb() is called to release unused
549 * reservations and apply superblock counter changes to the in-core
550 * superblock.
551 *
552 * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
553 */
554void
555xfs_trans_unreserve_and_mod_sb(
556 xfs_trans_t *tp)
557{
558 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */
559 xfs_mod_sb_t *msbp;
560 /* REFERENCED */
561 int error;
562 int rsvd;
563
564 msbp = msb;
565 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
566
567 /*
568 * Release any reserved blocks. Any that were allocated
569 * will be taken back again by fdblocks_delta below.
570 */
571 if (tp->t_blk_res > 0) {
572 msbp->msb_field = XFS_SBS_FDBLOCKS;
573 msbp->msb_delta = tp->t_blk_res;
574 msbp++;
575 }
576
577 /*
578 * Release any reserved realtime extents. Any that were
579 * allocated will be taken back again by frextents_delta below.
580 */
581 if (tp->t_rtx_res > 0) {
582 msbp->msb_field = XFS_SBS_FREXTENTS;
583 msbp->msb_delta = tp->t_rtx_res;
584 msbp++;
585 }
586
587 /*
588 * Apply any superblock modifications to the in-core version.
589 * The t_res_fdblocks_delta and t_res_frextents_delta fields are
590 * explicitly NOT applied to the in-core superblock;
591 * those deltas have already been applied there.
592 */
593 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
594 if (tp->t_icount_delta != 0) {
595 msbp->msb_field = XFS_SBS_ICOUNT;
596 msbp->msb_delta = (int)tp->t_icount_delta;
597 msbp++;
598 }
599 if (tp->t_ifree_delta != 0) {
600 msbp->msb_field = XFS_SBS_IFREE;
601 msbp->msb_delta = (int)tp->t_ifree_delta;
602 msbp++;
603 }
604 if (tp->t_fdblocks_delta != 0) {
605 msbp->msb_field = XFS_SBS_FDBLOCKS;
606 msbp->msb_delta = (int)tp->t_fdblocks_delta;
607 msbp++;
608 }
609 if (tp->t_frextents_delta != 0) {
610 msbp->msb_field = XFS_SBS_FREXTENTS;
611 msbp->msb_delta = (int)tp->t_frextents_delta;
612 msbp++;
613 }
614 if (tp->t_dblocks_delta != 0) {
615 msbp->msb_field = XFS_SBS_DBLOCKS;
616 msbp->msb_delta = (int)tp->t_dblocks_delta;
617 msbp++;
618 }
619 if (tp->t_agcount_delta != 0) {
620 msbp->msb_field = XFS_SBS_AGCOUNT;
621 msbp->msb_delta = (int)tp->t_agcount_delta;
622 msbp++;
623 }
624 if (tp->t_imaxpct_delta != 0) {
625 msbp->msb_field = XFS_SBS_IMAX_PCT;
626 msbp->msb_delta = (int)tp->t_imaxpct_delta;
627 msbp++;
628 }
629 if (tp->t_rextsize_delta != 0) {
630 msbp->msb_field = XFS_SBS_REXTSIZE;
631 msbp->msb_delta = (int)tp->t_rextsize_delta;
632 msbp++;
633 }
634 if (tp->t_rbmblocks_delta != 0) {
635 msbp->msb_field = XFS_SBS_RBMBLOCKS;
636 msbp->msb_delta = (int)tp->t_rbmblocks_delta;
637 msbp++;
638 }
639 if (tp->t_rblocks_delta != 0) {
640 msbp->msb_field = XFS_SBS_RBLOCKS;
641 msbp->msb_delta = (int)tp->t_rblocks_delta;
642 msbp++;
643 }
644 if (tp->t_rextents_delta != 0) {
645 msbp->msb_field = XFS_SBS_REXTENTS;
646 msbp->msb_delta = (int)tp->t_rextents_delta;
647 msbp++;
648 }
649 if (tp->t_rextslog_delta != 0) {
650 msbp->msb_field = XFS_SBS_REXTSLOG;
651 msbp->msb_delta = (int)tp->t_rextslog_delta;
652 msbp++;
653 }
654 }
655
656 /*
657 * If we need to change anything, do it.
658 */
659 if (msbp > msb) {
660 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
661 (uint)(msbp - msb), rsvd);
662 ASSERT(error == 0);
663 }
664}
665
666
667/*
668 * xfs_trans_commit
669 *
670 * Commit the given transaction to the log a/synchronously.
671 *
672 * The XFS disk error handling mechanism is not based on a typical
673 * transaction abort mechanism. Logically, after the filesystem
674 * gets marked 'SHUTDOWN', we can't let any new transactions
675 * be durable - ie. committed to disk - because some metadata might
676 * be inconsistent. In such cases, this returns an error, and the
677 * caller may assume that all locked objects joined to the transaction
678 * have already been unlocked as if the commit had succeeded.
679 * Do not reference the transaction structure after this call.
680 */
681 /*ARGSUSED*/
682int
683xfs_trans_commit(
684 xfs_trans_t *tp,
685 uint flags,
686 xfs_lsn_t *commit_lsn_p)
687{
688 xfs_log_iovec_t *log_vector;
689 int nvec;
690 xfs_mount_t *mp;
691 xfs_lsn_t commit_lsn;
692 /* REFERENCED */
693 int error;
694 int log_flags;
695 int sync;
696#define XFS_TRANS_LOGVEC_COUNT 16
697 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
698#if defined(XLOG_NOLOG) || defined(DEBUG)
699 static xfs_lsn_t trans_lsn = 1;
700#endif
701 void *commit_iclog;
702 int shutdown;
703
704 commit_lsn = -1;
705
706 /*
707 * Determine whether this commit is releasing a permanent
708 * log reservation or not.
709 */
710 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
711 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
712 log_flags = XFS_LOG_REL_PERM_RESERV;
713 } else {
714 log_flags = 0;
715 }
716 mp = tp->t_mountp;
717
718 /*
719 * If there is nothing to be logged by the transaction,
720 * then unlock all of the items associated with the
721 * transaction and free the transaction structure.
722 * Also make sure to return any reserved blocks to
723 * the free pool.
724 */
725shut_us_down:
726 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
727 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
728 xfs_trans_unreserve_and_mod_sb(tp);
 729		/*
 730		 * It is indeed possible for the transaction itself to be
 731		 * clean while its dqinfo portion is dirty. All that
 732		 * means is that we have some (non-persistent) quota
 733		 * reservations that need to be unreserved.
 734		 */
735 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
736 if (tp->t_ticket) {
737 commit_lsn = xfs_log_done(mp, tp->t_ticket,
738 NULL, log_flags);
739 if (commit_lsn == -1 && !shutdown)
740 shutdown = XFS_ERROR(EIO);
741 }
742 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
 743		xfs_trans_free_items(tp, shutdown ? XFS_TRANS_ABORT : 0);
744 xfs_trans_free_busy(tp);
745 xfs_trans_free(tp);
746 XFS_STATS_INC(xs_trans_empty);
747 if (commit_lsn_p)
748 *commit_lsn_p = commit_lsn;
749 return (shutdown);
750 }
751#if defined(XLOG_NOLOG) || defined(DEBUG)
752 ASSERT(!xlog_debug || tp->t_ticket != NULL);
753#else
754 ASSERT(tp->t_ticket != NULL);
755#endif
756
757 /*
758 * If we need to update the superblock, then do it now.
759 */
760 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
761 xfs_trans_apply_sb_deltas(tp);
762 }
763 XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp);
764
765 /*
766 * Ask each log item how many log_vector entries it will
767 * need so we can figure out how many to allocate.
768 * Try to avoid the kmem_alloc() call in the common case
769 * by using a vector from the stack when it fits.
770 */
771 nvec = xfs_trans_count_vecs(tp);
772
773 if (nvec == 0) {
774 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
775 goto shut_us_down;
776 }
777
778
779 if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
780 log_vector = log_vector_fast;
781 } else {
782 log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec *
783 sizeof(xfs_log_iovec_t),
784 KM_SLEEP);
785 }
786
787 /*
788 * Fill in the log_vector and pin the logged items, and
789 * then write the transaction to the log.
790 */
791 xfs_trans_fill_vecs(tp, log_vector);
792
 793	/*
 794	 * Ignore errors here; xfs_log_done would do the right thing
 795	 * anyway. We still need to put the ticket, etc. away.
 796	 */
797 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket,
798 &(tp->t_lsn));
799
800#if defined(XLOG_NOLOG) || defined(DEBUG)
801 if (xlog_debug) {
802 commit_lsn = xfs_log_done(mp, tp->t_ticket,
803 &commit_iclog, log_flags);
804 } else {
805 commit_lsn = 0;
806 tp->t_lsn = trans_lsn++;
807 }
808#else
809 /*
810 * This is the regular case. At this point (after the call finishes),
811 * the transaction is committed incore and could go out to disk at
812 * any time. However, all the items associated with the transaction
813 * are still locked and pinned in memory.
814 */
815 commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
816#endif
817
818 tp->t_commit_lsn = commit_lsn;
819 if (nvec > XFS_TRANS_LOGVEC_COUNT) {
820 kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
821 }
822
823 if (commit_lsn_p)
824 *commit_lsn_p = commit_lsn;
825
 826	/*
 827	 * If we got a log write error, unpin the log items that we had
 828	 * pinned, clean up, free the trans structure, and return the error.
 829	 */
830 if (error || commit_lsn == -1) {
831 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
832 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
833 return XFS_ERROR(EIO);
834 }
835
836 /*
837 * Once the transaction has committed, unused
838 * reservations need to be released and changes to
839 * the superblock need to be reflected in the in-core
840 * version. Do that now.
841 */
842 xfs_trans_unreserve_and_mod_sb(tp);
843
844 sync = tp->t_flags & XFS_TRANS_SYNC;
845
846 /*
847 * Tell the LM to call the transaction completion routine
848 * when the log write with LSN commit_lsn completes (e.g.
849 * when the transaction commit really hits the on-disk log).
850 * After this call we cannot reference tp, because the call
851 * can happen at any time and the call will free the transaction
852 * structure pointed to by tp. The only case where we call
853 * the completion routine (xfs_trans_committed) directly is
854 * if the log is turned off on a debug kernel or we're
855 * running in simulation mode (the log is explicitly turned
856 * off).
857 */
858 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
859 tp->t_logcb.cb_arg = tp;
860
861 /*
862 * We need to pass the iclog buffer which was used for the
863 * transaction commit record into this function, and attach
864 * the callback to it. The callback must be attached before
865 * the items are unlocked to avoid racing with other threads
866 * waiting for an item to unlock.
867 */
868 shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb));
869
870 /*
871 * Mark this thread as no longer being in a transaction
872 */
873 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
874
875 /*
876 * Once all the items of the transaction have been copied
877 * to the in core log and the callback is attached, the
878 * items can be unlocked.
879 *
880 * This will free descriptors pointing to items which were
881 * not logged since there is nothing more to do with them.
882 * For items which were logged, we will keep pointers to them
883 * so they can be unpinned after the transaction commits to disk.
884 * This will also stamp each modified meta-data item with
885 * the commit lsn of this transaction for dependency tracking
886 * purposes.
887 */
888 xfs_trans_unlock_items(tp, commit_lsn);
889
890 /*
891 * If we detected a log error earlier, finish committing
892 * the transaction now (unpin log items, etc).
893 *
894 * Order is critical here, to avoid using the transaction
 895	 * pointer after it's been freed (by xfs_trans_committed
896 * either here now, or as a callback). We cannot do this
897 * step inside xfs_log_notify as was done earlier because
898 * of this issue.
899 */
900 if (shutdown)
901 xfs_trans_committed(tp, XFS_LI_ABORTED);
902
903 /*
904 * Now that the xfs_trans_committed callback has been attached,
 905	 * and the items are released, we can finally allow the iclog to
906 * go to disk.
907 */
908 error = xfs_log_release_iclog(mp, commit_iclog);
909
910 /*
911 * If the transaction needs to be synchronous, then force the
912 * log out now and wait for it.
913 */
914 if (sync) {
915 if (!error)
916 error = xfs_log_force(mp, commit_lsn,
917 XFS_LOG_FORCE | XFS_LOG_SYNC);
918 XFS_STATS_INC(xs_trans_sync);
919 } else {
920 XFS_STATS_INC(xs_trans_async);
921 }
922
923 return (error);
924}
925
926
927/*
928 * Total up the number of log iovecs needed to commit this
929 * transaction. The transaction itself needs one for the
930 * transaction header. Ask each dirty item in turn how many
931 * it needs to get the total.
932 */
933STATIC uint
934xfs_trans_count_vecs(
935 xfs_trans_t *tp)
936{
937 int nvecs;
938 xfs_log_item_desc_t *lidp;
939
940 nvecs = 1;
941 lidp = xfs_trans_first_item(tp);
942 ASSERT(lidp != NULL);
943
 944	/*
 945	 * In the non-debug case we need to bail out if we didn't find
 946	 * a log item here: return zero and let trans_commit deal with it.
 947	 */
948 if (lidp == NULL)
949 return 0;
950
951 while (lidp != NULL) {
952 /*
953 * Skip items which aren't dirty in this transaction.
954 */
955 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
956 lidp = xfs_trans_next_item(tp, lidp);
957 continue;
958 }
959 lidp->lid_size = IOP_SIZE(lidp->lid_item);
960 nvecs += lidp->lid_size;
961 lidp = xfs_trans_next_item(tp, lidp);
962 }
963
964 return nvecs;
965}
966
967/*
968 * Called from the trans_commit code when we notice that
969 * the filesystem is in the middle of a forced shutdown.
970 */
971STATIC void
972xfs_trans_uncommit(
973 xfs_trans_t *tp,
974 uint flags)
975{
976 xfs_log_item_desc_t *lidp;
977
978 for (lidp = xfs_trans_first_item(tp);
979 lidp != NULL;
980 lidp = xfs_trans_next_item(tp, lidp)) {
 981		/*
 982		 * Unpin only the dirty items.
 983		 */
984 if (lidp->lid_flags & XFS_LID_DIRTY)
985 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
986 }
987
988 xfs_trans_unreserve_and_mod_sb(tp);
989 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
990
991 xfs_trans_free_items(tp, flags);
992 xfs_trans_free_busy(tp);
993 xfs_trans_free(tp);
994}
995
996/*
997 * Fill in the vector with pointers to data to be logged
998 * by this transaction. The transaction header takes
999 * the first vector, and then each dirty item takes the
1000 * number of vectors it indicated it needed in xfs_trans_count_vecs().
1001 *
1002 * As each item fills in the entries it needs, also pin the item
1003 * so that it cannot be flushed out until the log write completes.
1004 */
1005STATIC void
1006xfs_trans_fill_vecs(
1007 xfs_trans_t *tp,
1008 xfs_log_iovec_t *log_vector)
1009{
1010 xfs_log_item_desc_t *lidp;
1011 xfs_log_iovec_t *vecp;
1012 uint nitems;
1013
1014 /*
1015 * Skip over the entry for the transaction header, we'll
1016 * fill that in at the end.
1017 */
1018 vecp = log_vector + 1; /* pointer arithmetic */
1019
1020 nitems = 0;
1021 lidp = xfs_trans_first_item(tp);
1022 ASSERT(lidp != NULL);
1023 while (lidp != NULL) {
1024 /*
1025 * Skip items which aren't dirty in this transaction.
1026 */
1027 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1028 lidp = xfs_trans_next_item(tp, lidp);
1029 continue;
1030 }
1031		/*
1032		 * The item may be marked dirty but not log anything.
1033		 * This can be used to arrange for the item's committed
1034		 * routine to be called when the transaction commits.
1035		 */
1036 if (lidp->lid_size) {
1037 nitems++;
1038 }
1039 IOP_FORMAT(lidp->lid_item, vecp);
1040 vecp += lidp->lid_size; /* pointer arithmetic */
1041 IOP_PIN(lidp->lid_item);
1042 lidp = xfs_trans_next_item(tp, lidp);
1043 }
1044
1045 /*
1046 * Now that we've counted the number of items in this
1047 * transaction, fill in the transaction header.
1048 */
1049 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
1050 tp->t_header.th_type = tp->t_type;
1051 tp->t_header.th_num_items = nitems;
1052 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
1053 log_vector->i_len = sizeof(xfs_trans_header_t);
1054}
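
/*
 * Example (not part of the original source): a self-contained sketch of
 * the two-pass iovec pattern used by xfs_trans_count_vecs() and
 * xfs_trans_fill_vecs() above - first ask each item how many vectors it
 * needs, then allocate (or use an on-stack array in the common case)
 * and fill them in, skipping non-dirty items both times.  The
 * demo_item and demo_iovec names are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_iovec { void *addr; int len; };
struct demo_item  { int dirty; int nvecs; char data[16]; struct demo_item *next; };

#define DEMO_FAST_VECS	4

static int demo_count_vecs(struct demo_item *list)
{
	int n = 1;			/* one vector for the header */

	for (; list; list = list->next)
		if (list->dirty)
			n += list->nvecs;
	return n;
}

static void demo_fill_vecs(struct demo_item *list, struct demo_iovec *vec)
{
	struct demo_iovec *vp = vec + 1;	/* slot 0 is the header */

	for (; list; list = list->next) {
		if (!list->dirty)
			continue;
		vp->addr = list->data;
		vp->len = sizeof(list->data);
		vp += list->nvecs;
	}
}

int main(void)
{
	struct demo_item b = { 1, 1, "beta", NULL };
	struct demo_item a = { 0, 1, "alpha", &b };
	struct demo_iovec fast[DEMO_FAST_VECS], *vec = fast;
	int nvecs = demo_count_vecs(&a);

	if (nvecs > DEMO_FAST_VECS)	/* fall back to the heap, as above */
		vec = malloc(nvecs * sizeof(*vec));
	demo_fill_vecs(&a, vec);
	printf("nvecs=%d first=%s\n", nvecs, (char *)vec[1].addr);
	if (vec != fast)
		free(vec);
	return 0;
}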
1055
1056
1057/*
1058 * Unlock all of the transaction's items and free the transaction.
1059 * The transaction must not have modified any of its items, because
1060 * there is no way to restore them to their previous state.
1061 *
1062 * If the transaction has made a log reservation, make sure to release
1063 * it as well.
1064 */
1065void
1066xfs_trans_cancel(
1067 xfs_trans_t *tp,
1068 int flags)
1069{
1070 int log_flags;
1071#ifdef DEBUG
1072 xfs_log_item_chunk_t *licp;
1073 xfs_log_item_desc_t *lidp;
1074 xfs_log_item_t *lip;
1075 int i;
1076#endif
1077
1078 /*
1079 * See if the caller is being too lazy to figure out if
1080 * the transaction really needs an abort.
1081 */
1082 if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
1083 flags &= ~XFS_TRANS_ABORT;
1084 /*
1085 * See if the caller is relying on us to shut down the
1086 * filesystem. This happens in paths where we detect
1087 * corruption and decide to give up.
1088 */
1089 if ((tp->t_flags & XFS_TRANS_DIRTY) &&
1090 !XFS_FORCED_SHUTDOWN(tp->t_mountp))
1091 xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE);
1092#ifdef DEBUG
1093 if (!(flags & XFS_TRANS_ABORT)) {
1094 licp = &(tp->t_items);
1095 while (licp != NULL) {
1096 lidp = licp->lic_descs;
1097 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1098 if (XFS_LIC_ISFREE(licp, i)) {
1099 continue;
1100 }
1101
1102 lip = lidp->lid_item;
1103 if (!XFS_FORCED_SHUTDOWN(tp->t_mountp))
1104 ASSERT(!(lip->li_type == XFS_LI_EFD));
1105 }
1106 licp = licp->lic_next;
1107 }
1108 }
1109#endif
1110 xfs_trans_unreserve_and_mod_sb(tp);
1111 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
1112
1113 if (tp->t_ticket) {
1114 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
1115 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1116 log_flags = XFS_LOG_REL_PERM_RESERV;
1117 } else {
1118 log_flags = 0;
1119 }
1120 xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
1121 }
1122
1123 /* mark this thread as no longer being in a transaction */
1124 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
1125
1126 xfs_trans_free_items(tp, flags);
1127 xfs_trans_free_busy(tp);
1128 xfs_trans_free(tp);
1129}
1130
1131
1132/*
1133 * Free the transaction structure. If there is more clean up
1134 * to do when the structure is freed, add it here.
1135 */
1136STATIC void
1137xfs_trans_free(
1138 xfs_trans_t *tp)
1139{
1140 atomic_dec(&tp->t_mountp->m_active_trans);
1141 XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
1142 kmem_zone_free(xfs_trans_zone, tp);
1143}
1144
1145
1146/*
1147 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1148 *
1149 * This is typically called by the LM when a transaction has been fully
1150 * committed to disk. It needs to unpin the items which have
1151 * been logged by the transaction and update their positions
1152 * in the AIL if necessary.
1153 * This also gets called when the transaction didn't get written out
1154 * because of an I/O error; in that case abortflag has XFS_LI_ABORTED set.
1155 *
1156 * Call xfs_trans_chunk_committed() to process the items in
1157 * each chunk.
1158 */
1159STATIC void
1160xfs_trans_committed(
1161 xfs_trans_t *tp,
1162 int abortflag)
1163{
1164 xfs_log_item_chunk_t *licp;
1165 xfs_log_item_chunk_t *next_licp;
1166 xfs_log_busy_chunk_t *lbcp;
1167 xfs_log_busy_slot_t *lbsp;
1168 int i;
1169
1170 /*
1171 * Call the transaction's completion callback if there
1172 * is one.
1173 */
1174 if (tp->t_callback != NULL) {
1175 tp->t_callback(tp, tp->t_callarg);
1176 }
1177
1178 /*
1179 * Special case the chunk embedded in the transaction.
1180 */
1181 licp = &(tp->t_items);
1182 if (!(XFS_LIC_ARE_ALL_FREE(licp))) {
1183 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1184 }
1185
1186 /*
1187 * Process the items in each chunk in turn.
1188 */
1189 licp = licp->lic_next;
1190 while (licp != NULL) {
1191 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
1192 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1193 next_licp = licp->lic_next;
1194 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
1195 licp = next_licp;
1196 }
1197
1198 /*
1199 * Clear all the per-AG busy list items listed in this transaction
1200 */
1201 lbcp = &tp->t_busy;
1202 while (lbcp != NULL) {
1203 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1204 if (!XFS_LBC_ISFREE(lbcp, i)) {
1205 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1206 lbsp->lbc_idx);
1207 }
1208 }
1209 lbcp = lbcp->lbc_next;
1210 }
1211 xfs_trans_free_busy(tp);
1212
1213 /*
1214 * That's it for the transaction structure. Free it.
1215 */
1216 xfs_trans_free(tp);
1217}
1218
1219/*
1220 * This is called to perform the commit processing for each
1221 * item described by the given chunk.
1222 *
1223 * The commit processing consists of unlocking items which were
1224 * held locked with the SYNC_UNLOCK attribute, calling the committed
1225 * routine of each logged item, updating the item's position in the AIL
1226 * if necessary, and unpinning each item. If the committed routine
1227 * returns -1, then do nothing further with the item because it
1228 * may have been freed.
1229 *
1230 * Since items are unlocked when they are copied to the incore
1231 * log, it is possible for two transactions to be completing
1232 * and manipulating the same item simultaneously. The AIL lock
1233 * will protect the lsn field of each item. The value of this
1234 * field can never go backwards.
1235 *
1236 * We unpin the items after repositioning them in the AIL, because
1237 * otherwise they could be immediately flushed and we'd have to race
1238 * with the flusher trying to pull the item from the AIL as we add it.
1239 */
1240STATIC void
1241xfs_trans_chunk_committed(
1242 xfs_log_item_chunk_t *licp,
1243 xfs_lsn_t lsn,
1244 int aborted)
1245{
1246 xfs_log_item_desc_t *lidp;
1247 xfs_log_item_t *lip;
1248 xfs_lsn_t item_lsn;
1249 struct xfs_mount *mp;
1250 int i;
1251 SPLDECL(s);
1252
1253 lidp = licp->lic_descs;
1254 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1255 if (XFS_LIC_ISFREE(licp, i)) {
1256 continue;
1257 }
1258
1259 lip = lidp->lid_item;
1260 if (aborted)
1261 lip->li_flags |= XFS_LI_ABORTED;
1262
1263 /*
1264 * Send in the ABORTED flag to the COMMITTED routine
1265 * so that it knows whether the transaction was aborted
1266 * or not.
1267 */
1268 item_lsn = IOP_COMMITTED(lip, lsn);
1269
1270 /*
1271 * If the committed routine returns -1, make
1272 * no more references to the item.
1273 */
1274 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1275 continue;
1276 }
1277
1278 /*
1279 * If the returned lsn is greater than what it
1280 * contained before, update the location of the
1281 * item in the AIL. If it is not, then do nothing.
1282 * Items can never move backwards in the AIL.
1283 *
1284 * While the new lsn should usually be greater, it
1285 * is possible that a later transaction completing
1286 * simultaneously with an earlier one using the
1287 * same item could complete first with a higher lsn.
1288 * This would cause the earlier transaction to fail
1289 * the test below.
1290 */
1291 mp = lip->li_mountp;
1292 AIL_LOCK(mp,s);
1293 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1294 /*
1295 * This will set the item's lsn to item_lsn
1296 * and update the position of the item in
1297 * the AIL.
1298 *
1299 * xfs_trans_update_ail() drops the AIL lock.
1300 */
1301 xfs_trans_update_ail(mp, lip, item_lsn, s);
1302 } else {
1303 AIL_UNLOCK(mp, s);
1304 }
1305
1306 /*
1307 * Now that we've repositioned the item in the AIL,
1308 * unpin it so it can be flushed. Pass information
1309 * about buffer stale state down from the log item
1310		 * flags; if anyone else stales the buffer we do not
1311 * want to pay any attention to it.
1312 */
1313 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1314 }
1315}
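
/*
 * Example (not part of the original source): the "LSN never moves
 * backwards" rule enforced above, reduced to a tiny compare-and-update
 * helper.  Two completions can race on the same item, so the update is
 * only applied when the new LSN is strictly greater; in the kernel this
 * comparison happens under the AIL lock.  demo_lsn_cmp is a hypothetical
 * stand-in for XFS_LSN_CMP over plain 64-bit values.
 */
#include <stdint.h>
#include <stdio.h>

static int demo_lsn_cmp(uint64_t a, uint64_t b)
{
	return (a > b) - (a < b);
}

static void demo_update_lsn(uint64_t *item_lsn, uint64_t new_lsn)
{
	if (demo_lsn_cmp(new_lsn, *item_lsn) > 0)
		*item_lsn = new_lsn;	/* move forward */
	/* else: a later transaction already advanced it; do nothing */
}

int main(void)
{
	uint64_t lsn = 0;

	demo_update_lsn(&lsn, 7);	/* advances to 7 */
	demo_update_lsn(&lsn, 5);	/* ignored: would move backwards */
	printf("%llu\n", (unsigned long long)lsn);	/* prints 7 */
	return 0;
}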
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
new file mode 100644
index 000000000000..bd37ccb85e76
--- /dev/null
+++ b/fs/xfs/xfs_trans.h
@@ -0,0 +1,1042 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TRANS_H__
33#define __XFS_TRANS_H__
34
35/*
36 * This is the structure written in the log at the head of
37 * every transaction. It identifies the type and id of the
38 * transaction, and contains the number of items logged by
39 * the transaction so we know how many to expect during recovery.
40 *
 41 * Do not change the structure below without redoing the code in
42 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
43 */
44typedef struct xfs_trans_header {
45 uint th_magic; /* magic number */
46 uint th_type; /* transaction type */
47 __int32_t th_tid; /* transaction id (unused) */
48 uint th_num_items; /* num items logged by trans */
49} xfs_trans_header_t;
50
51#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
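
/*
 * Example (not part of the original source): the magic value above is
 * just the ASCII bytes 'T' 'R' 'A' 'N' packed into a 32-bit word, which
 * is what log recovery matches at the head of each logged transaction.
 * A quick self-contained way to see that:
 */
#include <stdio.h>

int main(void)
{
	unsigned int magic = 0x5452414e;	/* XFS_TRANS_HEADER_MAGIC */

	printf("%c%c%c%c\n",			/* prints "TRAN" */
	       (magic >> 24) & 0xff, (magic >> 16) & 0xff,
	       (magic >> 8) & 0xff, magic & 0xff);
	return 0;
}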
52
53/*
54 * Log item types.
55 */
56#define XFS_LI_5_3_BUF 0x1234 /* v1 bufs, 1-block inode buffers */
57#define XFS_LI_5_3_INODE 0x1235 /* 1-block inode buffers */
58#define XFS_LI_EFI 0x1236
59#define XFS_LI_EFD 0x1237
60#define XFS_LI_IUNLINK 0x1238
61#define XFS_LI_6_1_INODE 0x1239 /* 4K non-aligned inode bufs */
62#define XFS_LI_6_1_BUF 0x123a /* v1, 4K inode buffers */
63#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
64#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
65#define XFS_LI_DQUOT 0x123d
66#define XFS_LI_QUOTAOFF 0x123e
67
68/*
69 * Transaction types. Used to distinguish types of buffers.
70 */
71#define XFS_TRANS_SETATTR_NOT_SIZE 1
72#define XFS_TRANS_SETATTR_SIZE 2
73#define XFS_TRANS_INACTIVE 3
74#define XFS_TRANS_CREATE 4
75#define XFS_TRANS_CREATE_TRUNC 5
76#define XFS_TRANS_TRUNCATE_FILE 6
77#define XFS_TRANS_REMOVE 7
78#define XFS_TRANS_LINK 8
79#define XFS_TRANS_RENAME 9
80#define XFS_TRANS_MKDIR 10
81#define XFS_TRANS_RMDIR 11
82#define XFS_TRANS_SYMLINK 12
83#define XFS_TRANS_SET_DMATTRS 13
84#define XFS_TRANS_GROWFS 14
85#define XFS_TRANS_STRAT_WRITE 15
86#define XFS_TRANS_DIOSTRAT 16
87#define XFS_TRANS_WRITE_SYNC 17
88#define XFS_TRANS_WRITEID 18
89#define XFS_TRANS_ADDAFORK 19
90#define XFS_TRANS_ATTRINVAL 20
91#define XFS_TRANS_ATRUNCATE 21
92#define XFS_TRANS_ATTR_SET 22
93#define XFS_TRANS_ATTR_RM 23
94#define XFS_TRANS_ATTR_FLAG 24
95#define XFS_TRANS_CLEAR_AGI_BUCKET 25
96#define XFS_TRANS_QM_SBCHANGE 26
97/*
98 * Dummy entries since we use the transaction type to index into the
99 * trans_type[] in xlog_recover_print_trans_head()
100 */
101#define XFS_TRANS_DUMMY1 27
102#define XFS_TRANS_DUMMY2 28
103#define XFS_TRANS_QM_QUOTAOFF 29
104#define XFS_TRANS_QM_DQALLOC 30
105#define XFS_TRANS_QM_SETQLIM 31
106#define XFS_TRANS_QM_DQCLUSTER 32
107#define XFS_TRANS_QM_QINOCREATE 33
108#define XFS_TRANS_QM_QUOTAOFF_END 34
109#define XFS_TRANS_SB_UNIT 35
110#define XFS_TRANS_FSYNC_TS 36
111#define XFS_TRANS_GROWFSRT_ALLOC 37
112#define XFS_TRANS_GROWFSRT_ZERO 38
113#define XFS_TRANS_GROWFSRT_FREE 39
114#define XFS_TRANS_SWAPEXT 40
115/* new transaction types need to be reflected in xfs_logprint(8) */
116
117
118#ifdef __KERNEL__
119struct xfs_buf;
120struct xfs_buftarg;
121struct xfs_efd_log_item;
122struct xfs_efi_log_item;
123struct xfs_inode;
124struct xfs_item_ops;
125struct xfs_log_iovec;
126struct xfs_log_item;
127struct xfs_log_item_desc;
128struct xfs_mount;
129struct xfs_trans;
130struct xfs_dquot_acct;
131
132typedef struct xfs_ail_entry {
133 struct xfs_log_item *ail_forw; /* AIL forw pointer */
134 struct xfs_log_item *ail_back; /* AIL back pointer */
135} xfs_ail_entry_t;
136
137/*
138 * This structure is passed as a parameter to xfs_trans_push_ail()
 139 * and is used to track the LSN that the waiting processes are
 140 * waiting on to become unused.
141 */
142typedef struct xfs_ail_ticket {
 143	xfs_lsn_t	at_lsn;		/* lsn waited for */
144 struct xfs_ail_ticket *at_forw; /* wait list ptr */
145 struct xfs_ail_ticket *at_back; /* wait list ptr */
146 sv_t at_sema; /* wait sema */
147} xfs_ail_ticket_t;
148
149
150typedef struct xfs_log_item {
151 xfs_ail_entry_t li_ail; /* AIL pointers */
152 xfs_lsn_t li_lsn; /* last on-disk lsn */
153 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
154 struct xfs_mount *li_mountp; /* ptr to fs mount */
155 uint li_type; /* item type */
156 uint li_flags; /* misc flags */
157 struct xfs_log_item *li_bio_list; /* buffer item list */
158 void (*li_cb)(struct xfs_buf *,
159 struct xfs_log_item *);
160 /* buffer item iodone */
161 /* callback func */
162 struct xfs_item_ops *li_ops; /* function list */
163} xfs_log_item_t;
164
165#define XFS_LI_IN_AIL 0x1
166#define XFS_LI_ABORTED 0x2
167
168typedef struct xfs_item_ops {
169 uint (*iop_size)(xfs_log_item_t *);
170 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
171 void (*iop_pin)(xfs_log_item_t *);
172 void (*iop_unpin)(xfs_log_item_t *, int);
173 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
174 uint (*iop_trylock)(xfs_log_item_t *);
175 void (*iop_unlock)(xfs_log_item_t *);
176 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
177 void (*iop_push)(xfs_log_item_t *);
178 void (*iop_abort)(xfs_log_item_t *);
179 void (*iop_pushbuf)(xfs_log_item_t *);
180 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
181} xfs_item_ops_t;
182
183#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
184#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
185#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
186#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
187#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
188#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
189#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
190#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
191#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
192#define IOP_ABORT(ip) (*(ip)->li_ops->iop_abort)(ip)
193#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
194#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
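
/*
 * Example (not part of the original source): the IOP_*() macros above
 * are plain C "vtable" dispatch - each log item carries a pointer to an
 * ops structure and the macros indirect through it.  A minimal
 * self-contained version of the same idiom, with hypothetical names:
 */
#include <stdio.h>

struct demo_item;

struct demo_item_ops {
	unsigned int (*iop_size)(struct demo_item *);
};

struct demo_item {
	struct demo_item_ops *li_ops;
	unsigned int nbytes;
};

#define DEMO_IOP_SIZE(ip)	(*(ip)->li_ops->iop_size)(ip)

static unsigned int demo_buf_size(struct demo_item *ip)
{
	return ip->nbytes;
}

static struct demo_item_ops demo_buf_ops = { demo_buf_size };

int main(void)
{
	struct demo_item item = { &demo_buf_ops, 128 };

	printf("%u\n", DEMO_IOP_SIZE(&item));	/* dispatches to demo_buf_size */
	return 0;
}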
195
196/*
197 * Return values for the IOP_TRYLOCK() routines.
198 */
199#define XFS_ITEM_SUCCESS 0
200#define XFS_ITEM_PINNED 1
201#define XFS_ITEM_LOCKED 2
202#define XFS_ITEM_FLUSHING 3
203#define XFS_ITEM_PUSHBUF 4
204
205#endif /* __KERNEL__ */
206
207/*
208 * This structure is used to track log items associated with
209 * a transaction. It points to the log item and keeps some
210 * flags to track the state of the log item. It also tracks
211 * the amount of space needed to log the item it describes
212 * once we get to commit processing (see xfs_trans_commit()).
213 */
214typedef struct xfs_log_item_desc {
215 xfs_log_item_t *lid_item;
216 ushort lid_size;
217 unsigned char lid_flags;
218 unsigned char lid_index;
219} xfs_log_item_desc_t;
220
221#define XFS_LID_DIRTY 0x1
222#define XFS_LID_PINNED 0x2
223#define XFS_LID_BUF_STALE 0x8
224
225/*
226 * This structure is used to maintain a chunk list of log_item_desc
227 * structures. The free field is a bitmask indicating which descriptors
 228 * in this chunk's array are free. The unused field is the index of the
 229 * first slot that has never been used since this chunk was allocated.
230 */
231#define XFS_LIC_NUM_SLOTS 15
232typedef struct xfs_log_item_chunk {
233 struct xfs_log_item_chunk *lic_next;
234 ushort lic_free;
235 ushort lic_unused;
236 xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS];
237} xfs_log_item_chunk_t;
238
239#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1)
240#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1)
241
242
243/*
244 * Initialize the given chunk. Set the chunk's free descriptor mask
245 * to indicate that all descriptors are free. The caller gets to set
246 * lic_unused to the right value (0 matches all free). The
247 * lic_descs.lid_index values are set up as each desc is allocated.
248 */
249#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT)
250void xfs_lic_init(xfs_log_item_chunk_t *cp);
251#define XFS_LIC_INIT(cp) xfs_lic_init(cp)
252#else
253#define XFS_LIC_INIT(cp) ((cp)->lic_free = XFS_LIC_FREEMASK)
254#endif
255#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT_SLOT)
256void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot);
257#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot)
258#else
259#define XFS_LIC_INIT_SLOT(cp,slot) \
260 ((cp)->lic_descs[slot].lid_index = (unsigned char)(slot))
261#endif
262#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_VACANCY)
263int xfs_lic_vacancy(xfs_log_item_chunk_t *cp);
264#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp)
265#else
266#define XFS_LIC_VACANCY(cp) (((cp)->lic_free) & XFS_LIC_FREEMASK)
267#endif
268#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ALL_FREE)
269void xfs_lic_all_free(xfs_log_item_chunk_t *cp);
270#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp)
271#else
272#define XFS_LIC_ALL_FREE(cp) ((cp)->lic_free = XFS_LIC_FREEMASK)
273#endif
274#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ARE_ALL_FREE)
275int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp);
276#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp)
277#else
278#define XFS_LIC_ARE_ALL_FREE(cp) (((cp)->lic_free & XFS_LIC_FREEMASK) ==\
279 XFS_LIC_FREEMASK)
280#endif
281#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ISFREE)
282int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot);
283#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
284#else
285#define XFS_LIC_ISFREE(cp,slot) ((cp)->lic_free & (1 << (slot)))
286#endif
287#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_CLAIM)
288void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot);
289#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot)
290#else
291#define XFS_LIC_CLAIM(cp,slot) ((cp)->lic_free &= ~(1 << (slot)))
292#endif
293#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_RELSE)
294void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot);
295#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot)
296#else
297#define XFS_LIC_RELSE(cp,slot) ((cp)->lic_free |= 1 << (slot))
298#endif
299#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_SLOT)
300xfs_log_item_desc_t *xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot);
301#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot)
302#else
303#define XFS_LIC_SLOT(cp,slot) (&((cp)->lic_descs[slot]))
304#endif
305#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_SLOT)
306int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp);
307#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp)
308#else
309#define XFS_LIC_DESC_TO_SLOT(dp) ((uint)((dp)->lid_index))
310#endif
311/*
312 * Calculate the address of a chunk given a descriptor pointer:
 313 * dp - dp->lid_index gives the address of the start of the lic_descs array.
314 * From this we subtract the offset of the lic_descs field in a chunk.
315 * All of this yields the address of the chunk, which is
316 * cast to a chunk pointer.
317 */
318#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_CHUNK)
319xfs_log_item_chunk_t *xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp);
320#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp)
321#else
322#define XFS_LIC_DESC_TO_CHUNK(dp) ((xfs_log_item_chunk_t*) \
323 (((xfs_caddr_t)((dp) - (dp)->lid_index)) -\
324 (xfs_caddr_t)(((xfs_log_item_chunk_t*) \
325 0)->lic_descs)))
326#endif
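
/*
 * Example (not part of the original source): XFS_LIC_DESC_TO_CHUNK()
 * above recovers the enclosing chunk from a descriptor pointer by
 * stepping back lid_index slots and then subtracting the offset of the
 * descriptor array - the classic container_of idiom, spelled with a
 * null-pointer cast.  The same computation with offsetof(), using
 * hypothetical demo_desc/demo_chunk types:
 */
#include <stddef.h>
#include <stdio.h>

struct demo_desc  { unsigned char index; };
struct demo_chunk { struct demo_chunk *next; struct demo_desc descs[15]; };

static struct demo_chunk *demo_desc_to_chunk(struct demo_desc *dp)
{
	/* back up to descs[0], then to the start of the chunk */
	return (struct demo_chunk *)((char *)(dp - dp->index) -
				     offsetof(struct demo_chunk, descs));
}

int main(void)
{
	struct demo_chunk chunk;
	int i;

	for (i = 0; i < 15; i++)
		chunk.descs[i].index = (unsigned char)i;
	printf("%d\n", demo_desc_to_chunk(&chunk.descs[7]) == &chunk);
	return 0;
}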
327
328#ifdef __KERNEL__
329/*
330 * This structure is used to maintain a list of block ranges that have been
331 * freed in the transaction. The ranges are listed in the perag[] busy list
332 * between when they're freed and the transaction is committed to disk.
333 */
334
335typedef struct xfs_log_busy_slot {
336 xfs_agnumber_t lbc_ag;
337 ushort lbc_idx; /* index in perag.busy[] */
338} xfs_log_busy_slot_t;
339
340#define XFS_LBC_NUM_SLOTS 31
341typedef struct xfs_log_busy_chunk {
342 struct xfs_log_busy_chunk *lbc_next;
343 uint lbc_free; /* bitmask of free slots */
344 ushort lbc_unused; /* first unused */
345 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
346} xfs_log_busy_chunk_t;
347
348#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
349#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
350
351#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
352#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
353#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
354#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
355#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
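
/*
 * Example (not part of the original source): the XFS_LIC and XFS_LBC
 * macro families above manage a fixed-size slot array with a free-slot
 * bitmask (bit set = slot free).  A self-contained sketch of claiming
 * the first free slot and releasing it again; the demo_* names are
 * hypothetical.
 */
#include <stdio.h>

#define DEMO_NUM_SLOTS	15
#define DEMO_FREEMASK	((1 << DEMO_NUM_SLOTS) - 1)

static int demo_claim_slot(unsigned short *mask)
{
	int slot;

	for (slot = 0; slot < DEMO_NUM_SLOTS; slot++) {
		if (*mask & (1 << slot)) {		/* ISFREE */
			*mask &= ~(1 << slot);		/* CLAIM */
			return slot;
		}
	}
	return -1;					/* no vacancy */
}

static void demo_release_slot(unsigned short *mask, int slot)
{
	*mask |= 1 << slot;				/* RELSE */
}

int main(void)
{
	unsigned short mask = DEMO_FREEMASK;		/* INIT: all slots free */
	int slot = demo_claim_slot(&mask);

	printf("claimed %d, vacancy 0x%x\n", slot, mask & DEMO_FREEMASK);
	demo_release_slot(&mask, slot);
	printf("vacancy 0x%x\n", mask & DEMO_FREEMASK);
	return 0;
}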
356
357/*
358 * This is the type of function which can be given to xfs_trans_callback()
359 * to be called upon the transaction's commit to disk.
360 */
361typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
362
363/*
364 * This is the structure maintained for every active transaction.
365 */
366typedef struct xfs_trans {
367 unsigned int t_magic; /* magic number */
368 xfs_log_callback_t t_logcb; /* log callback struct */
369 struct xfs_trans *t_forw; /* async list pointers */
370 struct xfs_trans *t_back; /* async list pointers */
371 unsigned int t_type; /* transaction type */
372 unsigned int t_log_res; /* amt of log space resvd */
373 unsigned int t_log_count; /* count for perm log res */
374 unsigned int t_blk_res; /* # of blocks resvd */
375 unsigned int t_blk_res_used; /* # of resvd blocks used */
376 unsigned int t_rtx_res; /* # of rt extents resvd */
377 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
378 xfs_log_ticket_t t_ticket; /* log mgr ticket */
379 sema_t t_sema; /* sema for commit completion */
380 xfs_lsn_t t_lsn; /* log seq num of start of
381 * transaction. */
382 xfs_lsn_t t_commit_lsn; /* log seq num of end of
383 * transaction. */
384 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
385 struct xfs_dquot_acct *t_dqinfo; /* accting info for dquots */
386 xfs_trans_callback_t t_callback; /* transaction callback */
387 void *t_callarg; /* callback arg */
388 unsigned int t_flags; /* misc flags */
389 long t_icount_delta; /* superblock icount change */
390 long t_ifree_delta; /* superblock ifree change */
391 long t_fdblocks_delta; /* superblock fdblocks chg */
392 long t_res_fdblocks_delta; /* on-disk only chg */
393 long t_frextents_delta;/* superblock freextents chg*/
394 long t_res_frextents_delta; /* on-disk only chg */
395 long t_ag_freeblks_delta; /* debugging counter */
396 long t_ag_flist_delta; /* debugging counter */
397 long t_ag_btree_delta; /* debugging counter */
398 long t_dblocks_delta;/* superblock dblocks change */
399 long t_agcount_delta;/* superblock agcount change */
400 long t_imaxpct_delta;/* superblock imaxpct change */
401 long t_rextsize_delta;/* superblock rextsize chg */
402 long t_rbmblocks_delta;/* superblock rbmblocks chg */
403 long t_rblocks_delta;/* superblock rblocks change */
404 long t_rextents_delta;/* superblocks rextents chg */
405 long t_rextslog_delta;/* superblocks rextslog chg */
406 unsigned int t_items_free; /* log item descs free */
407 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
408 xfs_trans_header_t t_header; /* header for in-log trans */
409 unsigned int t_busy_free; /* busy descs free */
410 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
411 xfs_pflags_t t_pflags; /* saved pflags state */
412} xfs_trans_t;
413
414#endif /* __KERNEL__ */
415
416
417#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
418/*
419 * Values for t_flags.
420 */
421#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
422#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
423#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
424#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
425#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
426#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
427
428/*
429 * Values for call flags parameter.
430 */
431#define XFS_TRANS_NOSLEEP 0x1
432#define XFS_TRANS_WAIT 0x2
433#define XFS_TRANS_RELEASE_LOG_RES 0x4
434#define XFS_TRANS_ABORT 0x8
435
436/*
437 * Field values for xfs_trans_mod_sb.
438 */
439#define XFS_TRANS_SB_ICOUNT 0x00000001
440#define XFS_TRANS_SB_IFREE 0x00000002
441#define XFS_TRANS_SB_FDBLOCKS 0x00000004
442#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
443#define XFS_TRANS_SB_FREXTENTS 0x00000010
444#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
445#define XFS_TRANS_SB_DBLOCKS 0x00000040
446#define XFS_TRANS_SB_AGCOUNT 0x00000080
447#define XFS_TRANS_SB_IMAXPCT 0x00000100
448#define XFS_TRANS_SB_REXTSIZE 0x00000200
449#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
450#define XFS_TRANS_SB_RBLOCKS 0x00000800
451#define XFS_TRANS_SB_REXTENTS 0x00001000
452#define XFS_TRANS_SB_REXTSLOG 0x00002000
453
454
455/*
456 * Various log reservation values.
457 * These are based on the size of the file system block
458 * because that is what most transactions manipulate.
 459 * Each adds an additional 128 bytes per item logged to
460 * try to account for the overhead of the transaction mechanism.
461 *
462 * Note:
463 * Most of the reservations underestimate the number of allocation
464 * groups into which they could free extents in the xfs_bmap_finish()
465 * call. This is because the number in the worst case is quite high
466 * and quite unusual. In order to fix this we need to change
467 * xfs_bmap_finish() to free extents in only a single AG at a time.
468 * This will require changes to the EFI code as well, however, so that
469 * the EFI for the extents not freed is logged again in each transaction.
470 * See bug 261917.
471 */
472
473/*
474 * Per-extent log reservation for the allocation btree changes
475 * involved in freeing or allocating an extent.
476 * 2 trees * (2 blocks/level * max depth - 1) * block size
477 */
478#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
479 ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
480#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
481 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
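
/*
 * Example (not part of the original source): plugging assumed numbers
 * into the per-extent reservation above.  With a hypothetical 4096-byte
 * filesystem block and XFS_AG_MAXLEVELS of 5, one extent (nx == 1)
 * reserves 2 trees * (2 * 5 - 1) = 18 blocks, i.e. 73728 bytes, and
 * XFS_ALLOCFREE_LOG_COUNT comes to 18 logged regions.
 */
#include <stdio.h>

int main(void)
{
	long blocksize = 4096;	/* assumed fs block size */
	long maxlevels = 5;	/* assumed XFS_AG_MAXLEVELS */
	long nx = 1;		/* one extent */

	long res   = nx * (2 * ((2 * maxlevels - 1) * blocksize));
	long count = nx * (2 * (2 * maxlevels - 1));

	printf("res=%ld bytes, count=%ld items\n", res, count);	/* 73728, 18 */
	return 0;
}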
482
483/*
484 * Per-directory log reservation for any directory change.
485 * dir blocks: (1 btree block per level + data block + free block) * dblock size
486 * bmap btree: (levels + 2) * max depth * block size
487 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
488 * size, so account for that in the DAENTER macros.
489 */
490#define XFS_DIROP_LOG_RES(mp) \
491 (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
492 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
493#define XFS_DIROP_LOG_COUNT(mp) \
494 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
495 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
496
497/*
498 * In a write transaction we can allocate a maximum of 2
499 * extents. This gives:
500 * the inode getting the new extents: inode size
 501 * the inode's bmap btree: max depth * block size
502 * the agfs of the ags from which the extents are allocated: 2 * sector
503 * the superblock free block counter: sector size
504 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
505 * And the bmap_finish transaction can free bmap blocks in a join:
506 * the agfs of the ags containing the blocks: 2 * sector size
507 * the agfls of the ags containing the blocks: 2 * sector size
508 * the super block free block counter: sector size
509 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
510 */
511#define XFS_CALC_WRITE_LOG_RES(mp) \
512 (MAX( \
513 ((mp)->m_sb.sb_inodesize + \
514 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
515 (2 * (mp)->m_sb.sb_sectsize) + \
516 (mp)->m_sb.sb_sectsize + \
517 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
518 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
519 ((2 * (mp)->m_sb.sb_sectsize) + \
520 (2 * (mp)->m_sb.sb_sectsize) + \
521 (mp)->m_sb.sb_sectsize + \
522 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
523 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
524
525#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
526
527/*
528 * In truncating a file we free up to two extents at once. We can modify:
529 * the inode being truncated: inode size
 530 * the inode's bmap btree: (max depth + 1) * block size
531 * And the bmap_finish transaction can free the blocks and bmap blocks:
532 * the agf for each of the ags: 4 * sector size
533 * the agfl for each of the ags: 4 * sector size
534 * the super block to reflect the freed blocks: sector size
535 * worst case split in allocation btrees per extent assuming 4 extents:
536 * 4 exts * 2 trees * (2 * max depth - 1) * block size
537 * the inode btree: max depth * blocksize
538 * the allocation btrees: 2 trees * (max depth - 1) * block size
539 */
540#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
541 (MAX( \
542 ((mp)->m_sb.sb_inodesize + \
543 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
544 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
545 ((4 * (mp)->m_sb.sb_sectsize) + \
546 (4 * (mp)->m_sb.sb_sectsize) + \
547 (mp)->m_sb.sb_sectsize + \
548 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
549 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
550 (128 * 5) + \
551 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
552 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
553 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
554
555#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
556
557/*
 558 * In renaming a file we can modify:
559 * the four inodes involved: 4 * inode size
560 * the two directory btrees: 2 * (max depth + v2) * dir block size
561 * the two directory bmap btrees: 2 * max depth * block size
562 * And the bmap_finish transaction can free dir and bmap blocks (two sets
563 * of bmap blocks) giving:
564 * the agf for the ags in which the blocks live: 3 * sector size
565 * the agfl for the ags in which the blocks live: 3 * sector size
566 * the superblock for the free block count: sector size
567 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
568 */
569#define XFS_CALC_RENAME_LOG_RES(mp) \
570 (MAX( \
571 ((4 * (mp)->m_sb.sb_inodesize) + \
572 (2 * XFS_DIROP_LOG_RES(mp)) + \
573 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
574 ((3 * (mp)->m_sb.sb_sectsize) + \
575 (3 * (mp)->m_sb.sb_sectsize) + \
576 (mp)->m_sb.sb_sectsize + \
577 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
578 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
579
580#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
581
582/*
583 * For creating a link to an inode:
584 * the parent directory inode: inode size
585 * the linked inode: inode size
586 * the directory btree could split: (max depth + v2) * dir block size
587 * the directory bmap btree could join or split: (max depth + v2) * blocksize
588 * And the bmap_finish transaction can free some bmap blocks giving:
589 * the agf for the ag in which the blocks live: sector size
590 * the agfl for the ag in which the blocks live: sector size
591 * the superblock for the free block count: sector size
592 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
593 */
594#define XFS_CALC_LINK_LOG_RES(mp) \
595 (MAX( \
596 ((mp)->m_sb.sb_inodesize + \
597 (mp)->m_sb.sb_inodesize + \
598 XFS_DIROP_LOG_RES(mp) + \
599 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
600 ((mp)->m_sb.sb_sectsize + \
601 (mp)->m_sb.sb_sectsize + \
602 (mp)->m_sb.sb_sectsize + \
603 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
604 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
605
606#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
607
608/*
609 * For removing a directory entry we can modify:
610 * the parent directory inode: inode size
611 * the removed inode: inode size
612 * the directory btree could join: (max depth + v2) * dir block size
613 * the directory bmap btree could join or split: (max depth + v2) * blocksize
614 * And the bmap_finish transaction can free the dir and bmap blocks giving:
615 * the agf for the ag in which the blocks live: 2 * sector size
616 * the agfl for the ag in which the blocks live: 2 * sector size
617 * the superblock for the free block count: sector size
618 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
619 */
620#define XFS_CALC_REMOVE_LOG_RES(mp) \
621 (MAX( \
622 ((mp)->m_sb.sb_inodesize + \
623 (mp)->m_sb.sb_inodesize + \
624 XFS_DIROP_LOG_RES(mp) + \
625 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
626 ((2 * (mp)->m_sb.sb_sectsize) + \
627 (2 * (mp)->m_sb.sb_sectsize) + \
628 (mp)->m_sb.sb_sectsize + \
629 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
630 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
631
632#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
633
634/*
635 * For symlink we can modify:
636 * the parent directory inode: inode size
637 * the new inode: inode size
638 * the inode btree entry: 1 block
639 * the directory btree: (max depth + v2) * dir block size
 640 * the directory inode's bmap btree: (max depth + v2) * block size
641 * the blocks for the symlink: 1 KB
642 * Or in the first xact we allocate some inodes giving:
643 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
644 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
645 * the inode btree: max depth * blocksize
646 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
647 */
648#define XFS_CALC_SYMLINK_LOG_RES(mp) \
649 (MAX( \
650 ((mp)->m_sb.sb_inodesize + \
651 (mp)->m_sb.sb_inodesize + \
652 XFS_FSB_TO_B(mp, 1) + \
653 XFS_DIROP_LOG_RES(mp) + \
654 1024 + \
655 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
656 (2 * (mp)->m_sb.sb_sectsize + \
657 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
658 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
659 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
660 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
661 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
662
663#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
664
665/*
666 * For create we can modify:
667 * the parent directory inode: inode size
668 * the new inode: inode size
669 * the inode btree entry: block size
670 * the superblock for the nlink flag: sector size
671 * the directory btree: (max depth + v2) * dir block size
 672 * the directory inode's bmap btree: (max depth + v2) * block size
673 * Or in the first xact we allocate some inodes giving:
674 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
675 * the superblock for the nlink flag: sector size
676 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
677 * the inode btree: max depth * blocksize
678 * the allocation btrees: 2 trees * (max depth - 1) * block size
679 */
680#define XFS_CALC_CREATE_LOG_RES(mp) \
681 (MAX( \
682 ((mp)->m_sb.sb_inodesize + \
683 (mp)->m_sb.sb_inodesize + \
684 (mp)->m_sb.sb_sectsize + \
685 XFS_FSB_TO_B(mp, 1) + \
686 XFS_DIROP_LOG_RES(mp) + \
687 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
688 (3 * (mp)->m_sb.sb_sectsize + \
689 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
690 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
691 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
692 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
693 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
694
695#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
696
697/*
698 * Making a new directory is the same as creating a new file.
699 */
700#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
701
702#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
703
704/*
705 * In freeing an inode we can modify:
706 * the inode being freed: inode size
707 * the super block free inode counter: sector size
708 * the agi hash list and counters: sector size
709 * the inode btree entry: block size
710 * the on disk inode before ours in the agi hash list: inode cluster size
711 * the inode btree: max depth * blocksize
712 * the allocation btrees: 2 trees * (max depth - 1) * block size
713 */
714#define XFS_CALC_IFREE_LOG_RES(mp) \
715 ((mp)->m_sb.sb_inodesize + \
716 (mp)->m_sb.sb_sectsize + \
717 (mp)->m_sb.sb_sectsize + \
718 XFS_FSB_TO_B((mp), 1) + \
719 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
720 (128 * 5) + \
721 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
722 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
723 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
724
725
726#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
727
728/*
 729 * When only changing the inode we log the inode and possibly the superblock.
730 * We also add a bit of slop for the transaction stuff.
731 */
732#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
733 (mp)->m_sb.sb_sectsize + 512)
734
735#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
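
/*
 * Example (not part of the original source): the ichange reservation
 * above with assumed geometry - a hypothetical 256-byte inode and a
 * 512-byte sector give 256 + 512 + 512 = 1280 bytes of log space.
 */
#include <stdio.h>

int main(void)
{
	long inodesize = 256;	/* assumed sb_inodesize */
	long sectsize  = 512;	/* assumed sb_sectsize */

	printf("%ld\n", inodesize + sectsize + 512);	/* prints 1280 */
	return 0;
}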
736
737/*
738 * Growing the data section of the filesystem.
739 * superblock
740 * agi and agf
741 * allocation btrees
742 */
743#define XFS_CALC_GROWDATA_LOG_RES(mp) \
744 ((mp)->m_sb.sb_sectsize * 3 + \
745 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
746 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
747
748#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
749
750/*
751 * Growing the rt section of the filesystem.
752 * In the first set of transactions (ALLOC) we allocate space to the
753 * bitmap or summary files.
754 * superblock: sector size
755 * agf of the ag from which the extent is allocated: sector size
756 * bmap btree for bitmap/summary inode: max depth * blocksize
757 * bitmap/summary inode: inode size
758 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
759 */
760#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
761 (2 * (mp)->m_sb.sb_sectsize + \
762 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
763 (mp)->m_sb.sb_inodesize + \
764 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
765 (128 * \
766 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
767 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
768
769#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
770
771/*
772 * Growing the rt section of the filesystem.
773 * In the second set of transactions (ZERO) we zero the new metadata blocks.
774 * one bitmap/summary block: blocksize
775 */
776#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
777 ((mp)->m_sb.sb_blocksize + 128)
778
779#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
780
781/*
782 * Growing the rt section of the filesystem.
783 * In the third set of transactions (FREE) we update metadata without
784 * allocating any new blocks.
785 * superblock: sector size
786 * bitmap inode: inode size
787 * summary inode: inode size
788 * one bitmap block: blocksize
789 * summary blocks: new summary size
790 */
791#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
792 ((mp)->m_sb.sb_sectsize + \
793 2 * (mp)->m_sb.sb_inodesize + \
794 (mp)->m_sb.sb_blocksize + \
795 (mp)->m_rsumsize + \
796 (128 * 5))
797
798#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
799
800/*
801 * Logging the inode modification timestamp on a synchronous write.
802 * inode
803 */
804#define XFS_CALC_SWRITE_LOG_RES(mp) \
805 ((mp)->m_sb.sb_inodesize + 128)
806
807#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
808
809/*
810 * Logging the inode timestamps on an fsync -- same as SWRITE
811 * as long as SWRITE logs the entire inode core
812 */
813#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
814
815/*
816 * Logging the inode mode bits when writing a setuid/setgid file
817 * inode
818 */
819#define XFS_CALC_WRITEID_LOG_RES(mp) \
820 ((mp)->m_sb.sb_inodesize + 128)
821
822#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
823
824/*
825 * Converting the inode from non-attributed to attributed.
826 * the inode being converted: inode size
827 * agf block and superblock (for block allocation)
828 * the new block (directory sized)
829 * bmap blocks for the new directory block
830 * allocation btrees
831 */
832#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
833 ((mp)->m_sb.sb_inodesize + \
834 (mp)->m_sb.sb_sectsize * 2 + \
835 (mp)->m_dirblksize + \
836 (XFS_DIR_IS_V1(mp) ? 0 : \
837 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \
838 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
839 (128 * (4 + \
840 (XFS_DIR_IS_V1(mp) ? 0 : \
841 XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
842 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
843
844#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
845
846/*
847 * Removing the attribute fork of a file
848 * the inode being truncated: inode size
 849 * the inode's bmap btree: max depth * block size
850 * And the bmap_finish transaction can free the blocks and bmap blocks:
851 * the agf for each of the ags: 4 * sector size
852 * the agfl for each of the ags: 4 * sector size
853 * the super block to reflect the freed blocks: sector size
854 * worst case split in allocation btrees per extent assuming 4 extents:
855 * 4 exts * 2 trees * (2 * max depth - 1) * block size
856 */
857#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
858 (MAX( \
859 ((mp)->m_sb.sb_inodesize + \
860 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
861 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
862 ((4 * (mp)->m_sb.sb_sectsize) + \
863 (4 * (mp)->m_sb.sb_sectsize) + \
864 (mp)->m_sb.sb_sectsize + \
865 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
866 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
867
868#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
869
870/*
871 * Setting an attribute.
872 * the inode getting the attribute
873 * the superblock for allocations
874 * the agfs extents are allocated from
875 * the attribute btree * max depth
876 * the inode allocation btree
877 * Since attribute transaction space is dependent on the size of the attribute,
878 * the calculation is done partially at mount time and partially at runtime.
879 */
880#define XFS_CALC_ATTRSET_LOG_RES(mp) \
881 ((mp)->m_sb.sb_inodesize + \
882 (mp)->m_sb.sb_sectsize + \
883 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
884 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
885
886#define XFS_ATTRSET_LOG_RES(mp, ext) \
887 ((mp)->m_reservations.tr_attrset + \
888 (ext * (mp)->m_sb.sb_sectsize) + \
889 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
890 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
891
892/*
893 * Removing an attribute.
894 * the inode: inode size
895 * the attribute btree could join: max depth * block size
896 * the inode bmap btree could join or split: max depth * block size
897 * And the bmap_finish transaction can free the attr blocks freed giving:
898 * the agf for the ag in which the blocks live: 2 * sector size
899 * the agfl for the ag in which the blocks live: 2 * sector size
900 * the superblock for the free block count: sector size
901 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
902 */
903#define XFS_CALC_ATTRRM_LOG_RES(mp) \
904 (MAX( \
905 ((mp)->m_sb.sb_inodesize + \
906 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
907 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
908 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
909 ((2 * (mp)->m_sb.sb_sectsize) + \
910 (2 * (mp)->m_sb.sb_sectsize) + \
911 (mp)->m_sb.sb_sectsize + \
912 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
913 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
914
915#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
916
917/*
918 * Clearing a bad agino number in an agi hash bucket.
919 */
920#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
921 ((mp)->m_sb.sb_sectsize + 128)
922
923#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
924
925
926/*
927 * Various log count values.
928 */
929#define XFS_DEFAULT_LOG_COUNT 1
930#define XFS_DEFAULT_PERM_LOG_COUNT 2
931#define XFS_ITRUNCATE_LOG_COUNT 2
932#define XFS_INACTIVE_LOG_COUNT 2
933#define XFS_CREATE_LOG_COUNT 2
934#define XFS_MKDIR_LOG_COUNT 3
935#define XFS_SYMLINK_LOG_COUNT 3
936#define XFS_REMOVE_LOG_COUNT 2
937#define XFS_LINK_LOG_COUNT 2
938#define XFS_RENAME_LOG_COUNT 2
939#define XFS_WRITE_LOG_COUNT 2
940#define XFS_ADDAFORK_LOG_COUNT 2
941#define XFS_ATTRINVAL_LOG_COUNT 1
942#define XFS_ATTRSET_LOG_COUNT 3
943#define XFS_ATTRRM_LOG_COUNT 3
944
945/*
946 * Here we centralize the specification of XFS meta-data buffer
 947 * reference count values. This determines how hard the buffer
948 * cache tries to hold onto the buffer.
949 */
950#define XFS_AGF_REF 4
951#define XFS_AGI_REF 4
952#define XFS_AGFL_REF 3
953#define XFS_INO_BTREE_REF 3
954#define XFS_ALLOC_BTREE_REF 2
955#define XFS_BMAP_BTREE_REF 2
956#define XFS_DIR_BTREE_REF 2
957#define XFS_ATTR_BTREE_REF 1
958#define XFS_INO_REF 1
959#define XFS_DQUOT_REF 1
960
961#ifdef __KERNEL__
962/*
963 * XFS transaction mechanism exported interfaces that are
964 * actually macros.
965 */
966#define xfs_trans_get_log_res(tp) ((tp)->t_log_res)
967#define xfs_trans_get_log_count(tp) ((tp)->t_log_count)
968#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
969#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
970
971#ifdef DEBUG
972#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d)
973#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (long)d)
974#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (long)d)
975#else
976#define xfs_trans_agblocks_delta(tp, d)
977#define xfs_trans_agflist_delta(tp, d)
978#define xfs_trans_agbtree_delta(tp, d)
979#endif
980
981/*
982 * XFS transaction mechanism exported interfaces.
983 */
984void xfs_trans_init(struct xfs_mount *);
985xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
986xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
987xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
988int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
989 uint, uint);
990void xfs_trans_callback(xfs_trans_t *,
991 void (*)(xfs_trans_t *, void *), void *);
992void xfs_trans_mod_sb(xfs_trans_t *, uint, long);
993struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t,
994 int, uint);
995int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *,
996 struct xfs_buftarg *, xfs_daddr_t, int, uint,
997 struct xfs_buf **);
998struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
999
1000void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
1001void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
1002void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
1003void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
1004void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
1006void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
1007void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
1008void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
1009int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
1010 xfs_ino_t , uint, uint, struct xfs_inode **);
1011void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint);
1012void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *);
1013void xfs_trans_ihold_release(xfs_trans_t *, struct xfs_inode *);
1014void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
1015void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
1016struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
1017void xfs_efi_release(struct xfs_efi_log_item *, uint);
1018void xfs_trans_log_efi_extent(xfs_trans_t *,
1019 struct xfs_efi_log_item *,
1020 xfs_fsblock_t,
1021 xfs_extlen_t);
1022struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
1023 struct xfs_efi_log_item *,
1024 uint);
1025void xfs_trans_log_efd_extent(xfs_trans_t *,
1026 struct xfs_efd_log_item *,
1027 xfs_fsblock_t,
1028 xfs_extlen_t);
1029int xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *);
1030void xfs_trans_cancel(xfs_trans_t *, int);
1031void xfs_trans_ail_init(struct xfs_mount *);
1032xfs_lsn_t xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
1033xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *);
1034void xfs_trans_unlocked_item(struct xfs_mount *,
1035 xfs_log_item_t *);
1036xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1037 xfs_agnumber_t ag,
1038 xfs_extlen_t idx);
1039
1040#endif /* __KERNEL__ */
1041
1042#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
new file mode 100644
index 000000000000..7bc5eab4c2c1
--- /dev/null
+++ b/fs/xfs/xfs_trans_ail.c
@@ -0,0 +1,596 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h"
43#include "xfs_trans_priv.h"
44#include "xfs_error.h"
45
46STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
47STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
48STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
49STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
50
51#ifdef DEBUG
52STATIC void xfs_ail_check(xfs_ail_entry_t *);
53#else
54#define xfs_ail_check(a)
55#endif /* DEBUG */
56
57
58/*
59 * This is called by the log manager code to determine the LSN
60 * of the tail of the log. This is exactly the LSN of the first
61 * item in the AIL. If the AIL is empty, then this function
62 * returns 0.
63 *
64 * We need the AIL lock in order to get a coherent read of the
65 * lsn of the last item in the AIL.
66 */
67xfs_lsn_t
68xfs_trans_tail_ail(
69 xfs_mount_t *mp)
70{
71 xfs_lsn_t lsn;
72 xfs_log_item_t *lip;
73 SPLDECL(s);
74
75 AIL_LOCK(mp,s);
76 lip = xfs_ail_min(&(mp->m_ail));
77 if (lip == NULL) {
78 lsn = (xfs_lsn_t)0;
79 } else {
80 lsn = lip->li_lsn;
81 }
82 AIL_UNLOCK(mp, s);
83
84 return lsn;
85}
86
87/*
88 * xfs_trans_push_ail
89 *
90 * This routine is called to move the tail of the AIL
91 * forward. It does this by trying to flush items in the AIL
92 * whose lsns are below the given threshold_lsn.
93 *
94 * The routine returns the lsn of the tail of the log.
95 */
96xfs_lsn_t
97xfs_trans_push_ail(
98 xfs_mount_t *mp,
99 xfs_lsn_t threshold_lsn)
100{
101 xfs_lsn_t lsn;
102 xfs_log_item_t *lip;
103 int gen;
104 int restarts;
105 int lock_result;
106 int flush_log;
107 SPLDECL(s);
108
109#define XFS_TRANS_PUSH_AIL_RESTARTS 10
110
111 AIL_LOCK(mp,s);
112 lip = xfs_trans_first_ail(mp, &gen);
113 if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) {
114 /*
115 * Just return if the AIL is empty.
116 */
117 AIL_UNLOCK(mp, s);
118 return (xfs_lsn_t)0;
119 }
120
121 XFS_STATS_INC(xs_push_ail);
122
123 /*
124 * While the item we are looking at is below the given threshold
125 * try to flush it out. Make sure to limit the number of times
126 * we allow xfs_trans_next_ail() to restart scanning from the
127 * beginning of the list. We'd like not to stop until we've at least
128 * tried to push on everything in the AIL with an LSN less than
129 * the given threshold. However, we may give up before that if
130 * we realize that we've been holding the AIL_LOCK for 'too long',
 131 * blocking interrupts. Currently, 'too long' is roughly 500us.
132 */
133 flush_log = 0;
134 restarts = 0;
135 while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) &&
136 (XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) {
137 /*
138 * If we can lock the item without sleeping, unlock
139 * the AIL lock and flush the item. Then re-grab the
140 * AIL lock so we can look for the next item on the
141 * AIL. Since we unlock the AIL while we flush the
142 * item, the next routine may start over again at the
 143 * beginning of the list if anything has changed.
144 * That is what the generation count is for.
145 *
146 * If we can't lock the item, either its holder will flush
147 * it or it is already being flushed or it is being relogged.
 148 * In any of these cases it is being taken care of and we
149 * can just skip to the next item in the list.
150 */
151 lock_result = IOP_TRYLOCK(lip);
152 switch (lock_result) {
153 case XFS_ITEM_SUCCESS:
154 AIL_UNLOCK(mp, s);
155 XFS_STATS_INC(xs_push_ail_success);
156 IOP_PUSH(lip);
157 AIL_LOCK(mp,s);
158 break;
159
160 case XFS_ITEM_PUSHBUF:
161 AIL_UNLOCK(mp, s);
162 XFS_STATS_INC(xs_push_ail_pushbuf);
163#ifdef XFSRACEDEBUG
164 delay_for_intr();
165 delay(300);
166#endif
167 ASSERT(lip->li_ops->iop_pushbuf);
168 ASSERT(lip);
169 IOP_PUSHBUF(lip);
170 AIL_LOCK(mp,s);
171 break;
172
173 case XFS_ITEM_PINNED:
174 XFS_STATS_INC(xs_push_ail_pinned);
175 flush_log = 1;
176 break;
177
178 case XFS_ITEM_LOCKED:
179 XFS_STATS_INC(xs_push_ail_locked);
180 break;
181
182 case XFS_ITEM_FLUSHING:
183 XFS_STATS_INC(xs_push_ail_flushing);
184 break;
185
186 default:
187 ASSERT(0);
188 break;
189 }
190
191 lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
192 if (lip == NULL) {
193 break;
194 }
195 if (XFS_FORCED_SHUTDOWN(mp)) {
196 /*
197 * Just return if we shut down during the last try.
198 */
199 AIL_UNLOCK(mp, s);
200 return (xfs_lsn_t)0;
201 }
202
203 }
204
205 if (flush_log) {
206 /*
207 * If something we need to push out was pinned, then
208 * push out the log so it will become unpinned and
209 * move forward in the AIL.
210 */
211 AIL_UNLOCK(mp, s);
212 XFS_STATS_INC(xs_push_ail_flush);
213 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
214 AIL_LOCK(mp, s);
215 }
216
217 lip = xfs_ail_min(&(mp->m_ail));
218 if (lip == NULL) {
219 lsn = (xfs_lsn_t)0;
220 } else {
221 lsn = lip->li_lsn;
222 }
223
224 AIL_UNLOCK(mp, s);
225 return lsn;
226} /* xfs_trans_push_ail */
227
228
229/*
230 * This is to be called when an item is unlocked that may have
231 * been in the AIL. It will wake up the first member of the AIL
232 * wait list if this item's unlocking might allow it to progress.
233 * If the item is in the AIL, then we need to get the AIL lock
234 * while doing our checking so we don't race with someone going
235 * to sleep waiting for this event in xfs_trans_push_ail().
236 */
237void
238xfs_trans_unlocked_item(
239 xfs_mount_t *mp,
240 xfs_log_item_t *lip)
241{
242 xfs_log_item_t *min_lip;
243
244 /*
245 * If we're forcibly shutting down, we may have
246 * unlocked log items arbitrarily. The last thing
247 * we want to do is to move the tail of the log
248 * over some potentially valid data.
249 */
250 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
251 XFS_FORCED_SHUTDOWN(mp)) {
252 return;
253 }
254
255 /*
256 * This is the one case where we can call into xfs_ail_min()
257 * without holding the AIL lock because we only care about the
258 * case where we are at the tail of the AIL. If the object isn't
259 * at the tail, it doesn't matter what result we get back. This
 260 * is slightly racy: since we were just unlocked, we could
261 * go to sleep between the call to xfs_ail_min and the call to
 262 * xfs_log_move_tail, have someone else lock us, commit us to disk,
263 * move us out of the tail of the AIL, and then we wake up. However,
264 * the call to xfs_log_move_tail() doesn't do anything if there's
265 * not enough free space to wake people up so we're safe calling it.
266 */
267 min_lip = xfs_ail_min(&mp->m_ail);
268
269 if (min_lip == lip)
270 xfs_log_move_tail(mp, 1);
271} /* xfs_trans_unlocked_item */
272
273
274/*
275 * Update the position of the item in the AIL with the new
276 * lsn. If it is not yet in the AIL, add it. Otherwise, move
277 * it to its new position by removing it and re-adding it.
278 *
 279 * Wake up anyone with an lsn less than the item's lsn. If the item
280 * we move in the AIL is the minimum one, update the tail lsn in the
281 * log manager.
282 *
283 * Increment the AIL's generation count to indicate that the tree
284 * has changed.
285 *
286 * This function must be called with the AIL lock held. The lock
287 * is dropped before returning, so the caller must pass in the
288 * cookie returned by AIL_LOCK.
289 */
290void
291xfs_trans_update_ail(
292 xfs_mount_t *mp,
293 xfs_log_item_t *lip,
294 xfs_lsn_t lsn,
295 unsigned long s)
296{
297 xfs_ail_entry_t *ailp;
298 xfs_log_item_t *dlip=NULL;
299 xfs_log_item_t *mlip; /* ptr to minimum lip */
300
301 ailp = &(mp->m_ail);
302 mlip = xfs_ail_min(ailp);
303
304 if (lip->li_flags & XFS_LI_IN_AIL) {
305 dlip = xfs_ail_delete(ailp, lip);
306 ASSERT(dlip == lip);
307 } else {
308 lip->li_flags |= XFS_LI_IN_AIL;
309 }
310
311 lip->li_lsn = lsn;
312
313 xfs_ail_insert(ailp, lip);
314 mp->m_ail_gen++;
315
316 if (mlip == dlip) {
317 mlip = xfs_ail_min(&(mp->m_ail));
318 AIL_UNLOCK(mp, s);
319 xfs_log_move_tail(mp, mlip->li_lsn);
320 } else {
321 AIL_UNLOCK(mp, s);
322 }
323
324
325} /* xfs_trans_update_ail */
326
327/*
328 * Delete the given item from the AIL. It must already be in
329 * the AIL.
330 *
 331 * Wake up anyone with an lsn less than the item's lsn. If the item
332 * we delete in the AIL is the minimum one, update the tail lsn in the
333 * log manager.
334 *
335 * Clear the IN_AIL flag from the item, reset its lsn to 0, and
336 * bump the AIL's generation count to indicate that the tree
337 * has changed.
338 *
339 * This function must be called with the AIL lock held. The lock
340 * is dropped before returning, so the caller must pass in the
341 * cookie returned by AIL_LOCK.
342 */
343void
344xfs_trans_delete_ail(
345 xfs_mount_t *mp,
346 xfs_log_item_t *lip,
347 unsigned long s)
348{
349 xfs_ail_entry_t *ailp;
350 xfs_log_item_t *dlip;
351 xfs_log_item_t *mlip;
352
353 if (lip->li_flags & XFS_LI_IN_AIL) {
354 ailp = &(mp->m_ail);
355 mlip = xfs_ail_min(ailp);
356 dlip = xfs_ail_delete(ailp, lip);
357 ASSERT(dlip == lip);
358
359
360 lip->li_flags &= ~XFS_LI_IN_AIL;
361 lip->li_lsn = 0;
362 mp->m_ail_gen++;
363
364 if (mlip == dlip) {
365 mlip = xfs_ail_min(&(mp->m_ail));
366 AIL_UNLOCK(mp, s);
367 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
368 } else {
369 AIL_UNLOCK(mp, s);
370 }
371 }
372 else {
373 /*
 374 * If the file system is not being shut down, we are in
375 * serious trouble if we get to this stage.
376 */
377 if (XFS_FORCED_SHUTDOWN(mp))
378 AIL_UNLOCK(mp, s);
379 else {
380 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
381 "xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
382 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
383 AIL_UNLOCK(mp, s);
384 }
385 }
386}
387
388
389
390/*
391 * Return the item in the AIL with the smallest lsn.
392 * Return the current tree generation number for use
393 * in calls to xfs_trans_next_ail().
394 */
395xfs_log_item_t *
396xfs_trans_first_ail(
397 xfs_mount_t *mp,
398 int *gen)
399{
400 xfs_log_item_t *lip;
401
402 lip = xfs_ail_min(&(mp->m_ail));
403 *gen = (int)mp->m_ail_gen;
404
405 return (lip);
406}
407
408/*
409 * If the generation count of the tree has not changed since the
 410 * caller last took something from the AIL, then return the element
411 * in the tree which follows the one given. If the count has changed,
 412 * then return the minimum element of the AIL and bump the restarts counter
413 * if one is given.
414 */
415xfs_log_item_t *
416xfs_trans_next_ail(
417 xfs_mount_t *mp,
418 xfs_log_item_t *lip,
419 int *gen,
420 int *restarts)
421{
422 xfs_log_item_t *nlip;
423
424 ASSERT(mp && lip && gen);
425 if (mp->m_ail_gen == *gen) {
426 nlip = xfs_ail_next(&(mp->m_ail), lip);
427 } else {
428 nlip = xfs_ail_min(&(mp->m_ail));
429 *gen = (int)mp->m_ail_gen;
430 if (restarts != NULL) {
431 XFS_STATS_INC(xs_push_ail_restarts);
432 (*restarts)++;
433 }
434 }
435
436 return (nlip);
437}
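/*
 * A sketch of the restart-safe iteration pattern the two routines
 * above provide (xfs_trans_push_ail() above is the real user; names
 * here are illustrative):
 *
 *	lip = xfs_trans_first_ail(mp, &gen);
 *	while (lip != NULL) {
 *		... work on lip, possibly dropping the AIL lock ...
 *		lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
 *	}
 */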
438
439
440/*
441 * The active item list (AIL) is a doubly linked list of log
442 * items sorted by ascending lsn. The base of the list is
443 * a forw/back pointer pair embedded in the xfs mount structure.
444 * The base is initialized with both pointers pointing to the
445 * base. This case always needs to be distinguished, because
446 * the base has no lsn to look at. We almost always insert
447 * at the end of the list, so on inserts we search from the
448 * end of the list to find where the new item belongs.
449 */
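/*
 * For reference, a sketch of the list base (assumed to match the
 * xfs_ail_entry_t definition in xfs_trans.h of this era):
 *
 *	typedef struct xfs_ail_entry {
 *		struct xfs_log_item	*ail_forw;	- first item
 *		struct xfs_log_item	*ail_back;	- last item
 *	} xfs_ail_entry_t;
 *
 * An empty AIL has both pointers aimed at the base itself, which is
 * exactly what xfs_trans_ail_init() below establishes.
 */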
450
451/*
452 * Initialize the doubly linked list to point only to itself.
453 */
454void
455xfs_trans_ail_init(
456 xfs_mount_t *mp)
457{
458 mp->m_ail.ail_forw = (xfs_log_item_t*)&(mp->m_ail);
459 mp->m_ail.ail_back = (xfs_log_item_t*)&(mp->m_ail);
460}
461
462/*
463 * Insert the given log item into the AIL.
464 * We almost always insert at the end of the list, so on inserts
465 * we search from the end of the list to find where the
466 * new item belongs.
467 */
468STATIC void
469xfs_ail_insert(
470 xfs_ail_entry_t *base,
471 xfs_log_item_t *lip)
472/* ARGSUSED */
473{
474 xfs_log_item_t *next_lip;
475
476 /*
477 * If the list is empty, just insert the item.
478 */
479 if (base->ail_back == (xfs_log_item_t*)base) {
480 base->ail_forw = lip;
481 base->ail_back = lip;
482 lip->li_ail.ail_forw = (xfs_log_item_t*)base;
483 lip->li_ail.ail_back = (xfs_log_item_t*)base;
484 return;
485 }
486
487 next_lip = base->ail_back;
488 while ((next_lip != (xfs_log_item_t*)base) &&
489 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
490 next_lip = next_lip->li_ail.ail_back;
491 }
492 ASSERT((next_lip == (xfs_log_item_t*)base) ||
493 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
494 lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
495 lip->li_ail.ail_back = next_lip;
496 next_lip->li_ail.ail_forw = lip;
497 lip->li_ail.ail_forw->li_ail.ail_back = lip;
498
499 xfs_ail_check(base);
500 return;
501}
502
503/*
504 * Delete the given item from the AIL. Return a pointer to the item.
505 */
506/*ARGSUSED*/
507STATIC xfs_log_item_t *
508xfs_ail_delete(
509 xfs_ail_entry_t *base,
510 xfs_log_item_t *lip)
511/* ARGSUSED */
512{
513 lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
514 lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
515 lip->li_ail.ail_forw = NULL;
516 lip->li_ail.ail_back = NULL;
517
518 xfs_ail_check(base);
519 return lip;
520}
521
522/*
523 * Return a pointer to the first item in the AIL.
524 * If the AIL is empty, then return NULL.
525 */
526STATIC xfs_log_item_t *
527xfs_ail_min(
528 xfs_ail_entry_t *base)
529/* ARGSUSED */
530{
531 register xfs_log_item_t *forw = base->ail_forw;
532 if (forw == (xfs_log_item_t*)base) {
533 return NULL;
534 }
535 return forw;
536}
537
538/*
539 * Return a pointer to the item which follows
540 * the given item in the AIL. If the given item
541 * is the last item in the list, then return NULL.
542 */
543STATIC xfs_log_item_t *
544xfs_ail_next(
545 xfs_ail_entry_t *base,
546 xfs_log_item_t *lip)
547/* ARGSUSED */
548{
549 if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
550 return NULL;
551 }
552 return lip->li_ail.ail_forw;
553
554}
555
556#ifdef DEBUG
557/*
558 * Check that the list is sorted as it should be.
559 */
560STATIC void
561xfs_ail_check(
562 xfs_ail_entry_t *base)
563{
564 xfs_log_item_t *lip;
565 xfs_log_item_t *prev_lip;
566
567 lip = base->ail_forw;
568 if (lip == (xfs_log_item_t*)base) {
569 /*
570 * Make sure the pointers are correct when the list
571 * is empty.
572 */
573 ASSERT(base->ail_back == (xfs_log_item_t*)base);
574 return;
575 }
576
577 /*
578 * Walk the list checking forward and backward pointers,
579 * lsn ordering, and that every entry has the XFS_LI_IN_AIL
580 * flag set.
581 */
582 prev_lip = (xfs_log_item_t*)base;
583 while (lip != (xfs_log_item_t*)base) {
584 if (prev_lip != (xfs_log_item_t*)base) {
585 ASSERT(prev_lip->li_ail.ail_forw == lip);
586 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
587 }
588 ASSERT(lip->li_ail.ail_back == prev_lip);
589 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
590 prev_lip = lip;
591 lip = lip->li_ail.ail_forw;
592 }
593 ASSERT(lip == (xfs_log_item_t*)base);
594 ASSERT(base->ail_back == prev_lip);
595}
596#endif /* DEBUG */
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
new file mode 100644
index 000000000000..a9682b9510c1
--- /dev/null
+++ b/fs/xfs/xfs_trans_buf.c
@@ -0,0 +1,1093 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_buf_item.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_trans_priv.h"
46#include "xfs_error.h"
47#include "xfs_rw.h"
48
49
50STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
51 xfs_daddr_t, int);
52STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
53 xfs_daddr_t, int);
54
55
56/*
57 * Get and lock the buffer for the caller if it is not already
58 * locked within the given transaction. If it is already locked
59 * within the transaction, just increment its lock recursion count
60 * and return a pointer to it.
61 *
62 * Use the fast path function xfs_trans_buf_item_match() or the buffer
63 * cache routine incore_match() to find the buffer
64 * if it is already owned by this transaction.
65 *
66 * If we don't already own the buffer, use get_buf() to get it.
67 * If it doesn't yet have an associated xfs_buf_log_item structure,
68 * then allocate one and add the item to this transaction.
69 *
70 * If the transaction pointer is NULL, make this just a normal
71 * get_buf() call.
72 */
73xfs_buf_t *
74xfs_trans_get_buf(xfs_trans_t *tp,
75 xfs_buftarg_t *target_dev,
76 xfs_daddr_t blkno,
77 int len,
78 uint flags)
79{
80 xfs_buf_t *bp;
81 xfs_buf_log_item_t *bip;
82
83 if (flags == 0)
84 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
85
86 /*
87 * Default to a normal get_buf() call if the tp is NULL.
88 */
89 if (tp == NULL) {
90 bp = xfs_buf_get_flags(target_dev, blkno, len,
91 flags | BUF_BUSY);
92 return(bp);
93 }
94
95 /*
96 * If we find the buffer in the cache with this transaction
97 * pointer in its b_fsprivate2 field, then we know we already
98 * have it locked. In this case we just increment the lock
99 * recursion count and return the buffer to the caller.
100 */
101 if (tp->t_items.lic_next == NULL) {
102 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
103 } else {
104 bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
105 }
106 if (bp != NULL) {
107 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
108 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
109 xfs_buftrace("TRANS GET RECUR SHUT", bp);
110 XFS_BUF_SUPER_STALE(bp);
111 }
112 /*
113 * If the buffer is stale then it was binval'ed
114 * since last read. This doesn't matter since the
115 * caller isn't allowed to use the data anyway.
116 */
117 else if (XFS_BUF_ISSTALE(bp)) {
118 xfs_buftrace("TRANS GET RECUR STALE", bp);
119 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
120 }
121 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
122 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
123 ASSERT(bip != NULL);
124 ASSERT(atomic_read(&bip->bli_refcount) > 0);
125 bip->bli_recur++;
126 xfs_buftrace("TRANS GET RECUR", bp);
127 xfs_buf_item_trace("GET RECUR", bip);
128 return (bp);
129 }
130
131 /*
132 * We always specify the BUF_BUSY flag within a transaction so
133 * that get_buf does not try to push out a delayed write buffer
134 * which might cause another transaction to take place (if the
135 * buffer was delayed alloc). Such recursive transactions can
136 * easily deadlock with our current transaction as well as cause
137 * us to run out of stack space.
138 */
139 bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY);
140 if (bp == NULL) {
141 return NULL;
142 }
143
144 ASSERT(!XFS_BUF_GETERROR(bp));
145
146 /*
147 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
148 * it doesn't have one yet, then allocate one and initialize it.
149 * The checks to see if one is there are in xfs_buf_item_init().
150 */
151 xfs_buf_item_init(bp, tp->t_mountp);
152
153 /*
154 * Set the recursion count for the buffer within this transaction
155 * to 0.
156 */
157 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
158 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
159 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
160 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
161 bip->bli_recur = 0;
162
163 /*
164 * Take a reference for this transaction on the buf item.
165 */
166 atomic_inc(&bip->bli_refcount);
167
168 /*
169 * Get a log_item_desc to point at the new item.
170 */
171 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
172
173 /*
174 * Initialize b_fsprivate2 so we can find it with incore_match()
175 * above.
176 */
177 XFS_BUF_SET_FSPRIVATE2(bp, tp);
178
179 xfs_buftrace("TRANS GET", bp);
180 xfs_buf_item_trace("GET", bip);
181 return (bp);
182}
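/*
 * Illustrative call (a sketch; XFS_FSB_TO_DADDR()/XFS_FSB_TO_BB() are
 * the standard conversion macros, assumed available to callers):
 * getting a buffer for one newly allocated filesystem block with the
 * default lock/map flags:
 *
 *	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
 *			       XFS_FSB_TO_DADDR(mp, fsbno),
 *			       XFS_FSB_TO_BB(mp, 1), 0);
 */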
183
184/*
185 * Get and lock the superblock buffer of this file system for the
186 * given transaction.
187 *
188 * We don't need to use incore_match() here, because the superblock
189 * buffer is a private buffer which we keep a pointer to in the
190 * mount structure.
191 */
192xfs_buf_t *
193xfs_trans_getsb(xfs_trans_t *tp,
194 struct xfs_mount *mp,
195 int flags)
196{
197 xfs_buf_t *bp;
198 xfs_buf_log_item_t *bip;
199
200 /*
201 * Default to just trying to lock the superblock buffer
202 * if tp is NULL.
203 */
204 if (tp == NULL) {
205 return (xfs_getsb(mp, flags));
206 }
207
208 /*
209 * If the superblock buffer already has this transaction
210 * pointer in its b_fsprivate2 field, then we know we already
211 * have it locked. In this case we just increment the lock
212 * recursion count and return the buffer to the caller.
213 */
214 bp = mp->m_sb_bp;
215 if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) {
216 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
217 ASSERT(bip != NULL);
218 ASSERT(atomic_read(&bip->bli_refcount) > 0);
219 bip->bli_recur++;
220 xfs_buf_item_trace("GETSB RECUR", bip);
221 return (bp);
222 }
223
224 bp = xfs_getsb(mp, flags);
225 if (bp == NULL) {
226 return NULL;
227 }
228
229 /*
230 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
231 * it doesn't have one yet, then allocate one and initialize it.
232 * The checks to see if one is there are in xfs_buf_item_init().
233 */
234 xfs_buf_item_init(bp, mp);
235
236 /*
237 * Set the recursion count for the buffer within this transaction
238 * to 0.
239 */
240 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
241 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
242 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
243 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
244 bip->bli_recur = 0;
245
246 /*
247 * Take a reference for this transaction on the buf item.
248 */
249 atomic_inc(&bip->bli_refcount);
250
251 /*
252 * Get a log_item_desc to point at the new item.
253 */
254 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
255
256 /*
257 * Initialize b_fsprivate2 so we can find it with incore_match()
258 * above.
259 */
260 XFS_BUF_SET_FSPRIVATE2(bp, tp);
261
262 xfs_buf_item_trace("GETSB", bip);
263 return (bp);
264}
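/*
 * Illustrative use (a sketch, not code from this file): a transaction
 * applying superblock deltas locks the superblock through this
 * routine and then logs the modified byte range:
 *
 *	bp = xfs_trans_getsb(tp, mp, 0);
 *	... apply the deltas to the on-disk superblock image ...
 *	xfs_trans_log_buf(tp, bp, first_byte, last_byte);
 */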
265
266#ifdef DEBUG
267xfs_buftarg_t *xfs_error_target;
268int xfs_do_error;
269int xfs_req_num;
270int xfs_error_mod = 33;
271#endif
272
273/*
274 * Get and lock the buffer for the caller if it is not already
275 * locked within the given transaction. If it has not yet been
276 * read in, read it from disk. If it is already locked
277 * within the transaction and already read in, just increment its
278 * lock recursion count and return a pointer to it.
279 *
280 * Use the fast path function xfs_trans_buf_item_match() or the buffer
281 * cache routine incore_match() to find the buffer
282 * if it is already owned by this transaction.
283 *
284 * If we don't already own the buffer, use read_buf() to get it.
285 * If it doesn't yet have an associated xfs_buf_log_item structure,
286 * then allocate one and add the item to this transaction.
287 *
288 * If the transaction pointer is NULL, make this just a normal
289 * read_buf() call.
290 */
291int
292xfs_trans_read_buf(
293 xfs_mount_t *mp,
294 xfs_trans_t *tp,
295 xfs_buftarg_t *target,
296 xfs_daddr_t blkno,
297 int len,
298 uint flags,
299 xfs_buf_t **bpp)
300{
301 xfs_buf_t *bp;
302 xfs_buf_log_item_t *bip;
303 int error;
304
305 if (flags == 0)
306 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
307
308 /*
309 * Default to a normal get_buf() call if the tp is NULL.
310 */
311 if (tp == NULL) {
312 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
313 if (!bp)
314 return XFS_ERROR(ENOMEM);
315
316 if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
317 xfs_ioerror_alert("xfs_trans_read_buf", mp,
318 bp, blkno);
319 error = XFS_BUF_GETERROR(bp);
320 xfs_buf_relse(bp);
321 return error;
322 }
323#ifdef DEBUG
324 if (xfs_do_error && (bp != NULL)) {
325 if (xfs_error_target == target) {
326 if (((xfs_req_num++) % xfs_error_mod) == 0) {
327 xfs_buf_relse(bp);
328 printk("Returning error!\n");
329 return XFS_ERROR(EIO);
330 }
331 }
332 }
333#endif
334 if (XFS_FORCED_SHUTDOWN(mp))
335 goto shutdown_abort;
336 *bpp = bp;
337 return 0;
338 }
339
340 /*
341 * If we find the buffer in the cache with this transaction
342 * pointer in its b_fsprivate2 field, then we know we already
343 * have it locked. If it is already read in we just increment
344 * the lock recursion count and return the buffer to the caller.
345 * If the buffer is not yet read in, then we read it in, increment
346 * the lock recursion count, and return it to the caller.
347 */
348 if (tp->t_items.lic_next == NULL) {
349 bp = xfs_trans_buf_item_match(tp, target, blkno, len);
350 } else {
351 bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
352 }
353 if (bp != NULL) {
354 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
355 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
356 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
357 ASSERT((XFS_BUF_ISERROR(bp)) == 0);
358 if (!(XFS_BUF_ISDONE(bp))) {
359 xfs_buftrace("READ_BUF_INCORE !DONE", bp);
360 ASSERT(!XFS_BUF_ISASYNC(bp));
361 XFS_BUF_READ(bp);
362 xfsbdstrat(tp->t_mountp, bp);
363 xfs_iowait(bp);
364 if (XFS_BUF_GETERROR(bp) != 0) {
365 xfs_ioerror_alert("xfs_trans_read_buf", mp,
366 bp, blkno);
367 error = XFS_BUF_GETERROR(bp);
368 xfs_buf_relse(bp);
369 /*
370 * We can gracefully recover from most
371 * read errors. Ones we can't are those
372 * that happen after the transaction's
373 * already dirty.
374 */
375 if (tp->t_flags & XFS_TRANS_DIRTY)
376 xfs_force_shutdown(tp->t_mountp,
377 XFS_METADATA_IO_ERROR);
378 return error;
379 }
380 }
381 /*
382 * We never locked this buf ourselves, so we shouldn't
383 * brelse it either. Just get out.
384 */
385 if (XFS_FORCED_SHUTDOWN(mp)) {
386 xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp);
387 *bpp = NULL;
388 return XFS_ERROR(EIO);
389 }
390
391
392 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
393 bip->bli_recur++;
394
395 ASSERT(atomic_read(&bip->bli_refcount) > 0);
396 xfs_buf_item_trace("READ RECUR", bip);
397 *bpp = bp;
398 return 0;
399 }
400
401 /*
402 * We always specify the BUF_BUSY flag within a transaction so
403 * that get_buf does not try to push out a delayed write buffer
404 * which might cause another transaction to take place (if the
405 * buffer was delayed alloc). Such recursive transactions can
406 * easily deadlock with our current transaction as well as cause
407 * us to run out of stack space.
408 */
409 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
410 if (bp == NULL) {
411 *bpp = NULL;
412 return 0;
413 }
414 if (XFS_BUF_GETERROR(bp) != 0) {
415 XFS_BUF_SUPER_STALE(bp);
416 xfs_buftrace("READ ERROR", bp);
417 error = XFS_BUF_GETERROR(bp);
418
419 xfs_ioerror_alert("xfs_trans_read_buf", mp,
420 bp, blkno);
421 if (tp->t_flags & XFS_TRANS_DIRTY)
422 xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
423 xfs_buf_relse(bp);
424 return error;
425 }
426#ifdef DEBUG
427 if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) {
428 if (xfs_error_target == target) {
429 if (((xfs_req_num++) % xfs_error_mod) == 0) {
430 xfs_force_shutdown(tp->t_mountp,
431 XFS_METADATA_IO_ERROR);
432 xfs_buf_relse(bp);
433 printk("Returning error in trans!\n");
434 return XFS_ERROR(EIO);
435 }
436 }
437 }
438#endif
439 if (XFS_FORCED_SHUTDOWN(mp))
440 goto shutdown_abort;
441
442 /*
443 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
444 * it doesn't have one yet, then allocate one and initialize it.
445 * The checks to see if one is there are in xfs_buf_item_init().
446 */
447 xfs_buf_item_init(bp, tp->t_mountp);
448
449 /*
450 * Set the recursion count for the buffer within this transaction
451 * to 0.
452 */
453 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
454 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
455 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
456 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
457 bip->bli_recur = 0;
458
459 /*
460 * Take a reference for this transaction on the buf item.
461 */
462 atomic_inc(&bip->bli_refcount);
463
464 /*
465 * Get a log_item_desc to point at the new item.
466 */
467 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
468
469 /*
470 * Initialize b_fsprivate2 so we can find it with incore_match()
471 * above.
472 */
473 XFS_BUF_SET_FSPRIVATE2(bp, tp);
474
475 xfs_buftrace("TRANS READ", bp);
476 xfs_buf_item_trace("READ", bip);
477 *bpp = bp;
478 return 0;
479
480shutdown_abort:
481 /*
 482 * the theory here is that the buffer is good but we're
483 * bailing out because the filesystem is being forcibly
484 * shut down. So we should leave the b_flags alone since
485 * the buffer's not staled and just get out.
486 */
487#if defined(DEBUG)
488 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
489 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
490#endif
491 ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
492 (XFS_B_STALE|XFS_B_DELWRI));
493
494 xfs_buftrace("READ_BUF XFSSHUTDN", bp);
495 xfs_buf_relse(bp);
496 *bpp = NULL;
497 return XFS_ERROR(EIO);
498}
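/*
 * Illustrative call (a sketch; XFS_FSB_TO_DADDR()/XFS_FSB_TO_BB() are
 * the standard conversion macros): reading a single filesystem block
 * within a transaction:
 *
 *	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 *				   XFS_FSB_TO_DADDR(mp, fsbno),
 *				   XFS_FSB_TO_BB(mp, 1), 0, &bp);
 *	if (error)
 *		return error;
 */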
499
500
501/*
502 * Release the buffer bp which was previously acquired with one of the
503 * xfs_trans_... buffer allocation routines if the buffer has not
504 * been modified within this transaction. If the buffer is modified
505 * within this transaction, do decrement the recursion count but do
506 * not release the buffer even if the count goes to 0. If the buffer is not
507 * modified within the transaction, decrement the recursion count and
508 * release the buffer if the recursion count goes to 0.
509 *
510 * If the buffer is to be released and it was not modified before
511 * this transaction began, then free the buf_log_item associated with it.
512 *
513 * If the transaction pointer is NULL, make this just a normal
514 * brelse() call.
515 */
516void
517xfs_trans_brelse(xfs_trans_t *tp,
518 xfs_buf_t *bp)
519{
520 xfs_buf_log_item_t *bip;
521 xfs_log_item_t *lip;
522 xfs_log_item_desc_t *lidp;
523
524 /*
525 * Default to a normal brelse() call if the tp is NULL.
526 */
527 if (tp == NULL) {
528 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
529 /*
530 * If there's a buf log item attached to the buffer,
531 * then let the AIL know that the buffer is being
532 * unlocked.
533 */
534 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
535 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
536 if (lip->li_type == XFS_LI_BUF) {
537 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
538 xfs_trans_unlocked_item(
539 bip->bli_item.li_mountp,
540 lip);
541 }
542 }
543 xfs_buf_relse(bp);
544 return;
545 }
546
547 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
548 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
549 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
550 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
551 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
552 ASSERT(atomic_read(&bip->bli_refcount) > 0);
553
554 /*
555 * Find the item descriptor pointing to this buffer's
556 * log item. It must be there.
557 */
558 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
559 ASSERT(lidp != NULL);
560
561 /*
562 * If the release is just for a recursive lock,
563 * then decrement the count and return.
564 */
565 if (bip->bli_recur > 0) {
566 bip->bli_recur--;
567 xfs_buf_item_trace("RELSE RECUR", bip);
568 return;
569 }
570
571 /*
572 * If the buffer is dirty within this transaction, we can't
573 * release it until we commit.
574 */
575 if (lidp->lid_flags & XFS_LID_DIRTY) {
576 xfs_buf_item_trace("RELSE DIRTY", bip);
577 return;
578 }
579
580 /*
581 * If the buffer has been invalidated, then we can't release
582 * it until the transaction commits to disk unless it is re-dirtied
583 * as part of this transaction. This prevents us from pulling
584 * the item from the AIL before we should.
585 */
586 if (bip->bli_flags & XFS_BLI_STALE) {
587 xfs_buf_item_trace("RELSE STALE", bip);
588 return;
589 }
590
591 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
592 xfs_buf_item_trace("RELSE", bip);
593
594 /*
595 * Free up the log item descriptor tracking the released item.
596 */
597 xfs_trans_free_item(tp, lidp);
598
599 /*
600 * Clear the hold flag in the buf log item if it is set.
601 * We wouldn't want the next user of the buffer to
602 * get confused.
603 */
604 if (bip->bli_flags & XFS_BLI_HOLD) {
605 bip->bli_flags &= ~XFS_BLI_HOLD;
606 }
607
608 /*
609 * Drop our reference to the buf log item.
610 */
611 atomic_dec(&bip->bli_refcount);
612
613 /*
614 * If the buf item is not tracking data in the log, then
615 * we must free it before releasing the buffer back to the
616 * free pool. Before releasing the buffer to the free pool,
617 * clear the transaction pointer in b_fsprivate2 to dissolve
618 * its relation to this transaction.
619 */
620 if (!xfs_buf_item_dirty(bip)) {
621/***
622 ASSERT(bp->b_pincount == 0);
623***/
624 ASSERT(atomic_read(&bip->bli_refcount) == 0);
625 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
626 ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
627 xfs_buf_item_relse(bp);
628 bip = NULL;
629 }
630 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
631
632 /*
633 * If we've still got a buf log item on the buffer, then
634 * tell the AIL that the buffer is being unlocked.
635 */
636 if (bip != NULL) {
637 xfs_trans_unlocked_item(bip->bli_item.li_mountp,
638 (xfs_log_item_t*)bip);
639 }
640
641 xfs_buf_relse(bp);
642 return;
643}
644
645/*
646 * Add the locked buffer to the transaction.
647 * The buffer must be locked, and it cannot be associated with any
648 * transaction.
649 *
650 * If the buffer does not yet have a buf log item associated with it,
651 * then allocate one for it. Then add the buf item to the transaction.
652 */
653void
654xfs_trans_bjoin(xfs_trans_t *tp,
655 xfs_buf_t *bp)
656{
657 xfs_buf_log_item_t *bip;
658
659 ASSERT(XFS_BUF_ISBUSY(bp));
660 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
661
662 /*
663 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
664 * it doesn't have one yet, then allocate one and initialize it.
665 * The checks to see if one is there are in xfs_buf_item_init().
666 */
667 xfs_buf_item_init(bp, tp->t_mountp);
668 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
669 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
670 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
671 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
672
673 /*
674 * Take a reference for this transaction on the buf item.
675 */
676 atomic_inc(&bip->bli_refcount);
677
678 /*
679 * Get a log_item_desc to point at the new item.
680 */
681 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
682
683 /*
684 * Initialize b_fsprivate2 so we can find it with incore_match()
685 * in xfs_trans_get_buf() and friends above.
686 */
687 XFS_BUF_SET_FSPRIVATE2(bp, tp);
688
689 xfs_buf_item_trace("BJOIN", bip);
690}
691
692/*
693 * Mark the buffer as not needing to be unlocked when the buf item's
694 * IOP_UNLOCK() routine is called. The buffer must already be locked
695 * and associated with the given transaction.
696 */
697/* ARGSUSED */
698void
699xfs_trans_bhold(xfs_trans_t *tp,
700 xfs_buf_t *bp)
701{
702 xfs_buf_log_item_t *bip;
703
704 ASSERT(XFS_BUF_ISBUSY(bp));
705 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
706 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
707
708 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
709 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
710 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
711 ASSERT(atomic_read(&bip->bli_refcount) > 0);
712 bip->bli_flags |= XFS_BLI_HOLD;
713 xfs_buf_item_trace("BHOLD", bip);
714}
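/*
 * Illustrative pairing (a sketch, not code from this file): a caller
 * that spans two transactions holds the buffer across the first
 * commit, then joins it to the follow-up transaction:
 *
 *	xfs_trans_bhold(tp, bp);
 *	error = xfs_trans_commit(tp, 0, NULL);
 *	... allocate and reserve the follow-up transaction ntp ...
 *	xfs_trans_bjoin(ntp, bp);
 */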
715
716/*
717 * This is called to mark bytes first through last inclusive of the given
718 * buffer as needing to be logged when the transaction is committed.
719 * The buffer must already be associated with the given transaction.
720 *
721 * First and last are numbers relative to the beginning of this buffer,
722 * so the first byte in the buffer is numbered 0 regardless of the
723 * value of b_blkno.
724 */
725void
726xfs_trans_log_buf(xfs_trans_t *tp,
727 xfs_buf_t *bp,
728 uint first,
729 uint last)
730{
731 xfs_buf_log_item_t *bip;
732 xfs_log_item_desc_t *lidp;
733
734 ASSERT(XFS_BUF_ISBUSY(bp));
735 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
736 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
737 ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
738 ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) ||
739 (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks));
740
741 /*
742 * Mark the buffer as needing to be written out eventually,
743 * and set its iodone function to remove the buffer's buf log
744 * item from the AIL and free it when the buffer is flushed
745 * to disk. See xfs_buf_attach_iodone() for more details
746 * on li_cb and xfs_buf_iodone_callbacks().
747 * If we end up aborting this transaction, we trap this buffer
748 * inside the b_bdstrat callback so that this won't get written to
749 * disk.
750 */
751 XFS_BUF_DELAYWRITE(bp);
752 XFS_BUF_DONE(bp);
753
754 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
755 ASSERT(atomic_read(&bip->bli_refcount) > 0);
756 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
757 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
758
759 /*
760 * If we invalidated the buffer within this transaction, then
761 * cancel the invalidation now that we're dirtying the buffer
762 * again. There are no races with the code in xfs_buf_item_unpin(),
763 * because we have a reference to the buffer this entire time.
764 */
765 if (bip->bli_flags & XFS_BLI_STALE) {
766 xfs_buf_item_trace("BLOG UNSTALE", bip);
767 bip->bli_flags &= ~XFS_BLI_STALE;
768 ASSERT(XFS_BUF_ISSTALE(bp));
769 XFS_BUF_UNSTALE(bp);
770 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
771 }
772
773 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
774 ASSERT(lidp != NULL);
775
776 tp->t_flags |= XFS_TRANS_DIRTY;
777 lidp->lid_flags |= XFS_LID_DIRTY;
778 lidp->lid_flags &= ~XFS_LID_BUF_STALE;
779 bip->bli_flags |= XFS_BLI_LOGGED;
780 xfs_buf_item_log(bip, first, last);
781 xfs_buf_item_trace("BLOG", bip);
782}
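/*
 * Illustrative call (a sketch): having modified the first 128 bytes
 * of a buffer owned by this transaction, a caller logs that range
 * with byte offsets relative to the start of the buffer:
 *
 *	xfs_trans_log_buf(tp, bp, 0, 127);
 */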
783
784
785/*
 786 * This is called to invalidate a buffer that is being used within
787 * a transaction. Typically this is because the blocks in the
788 * buffer are being freed, so we need to prevent it from being
789 * written out when we're done. Allowing it to be written again
790 * might overwrite data in the free blocks if they are reallocated
791 * to a file.
792 *
793 * We prevent the buffer from being written out by clearing the
794 * B_DELWRI flag. We can't always
795 * get rid of the buf log item at this point, though, because
796 * the buffer may still be pinned by another transaction. If that
797 * is the case, then we'll wait until the buffer is committed to
798 * disk for the last time (we can tell by the ref count) and
799 * free it in xfs_buf_item_unpin(). Until it is cleaned up we
800 * will keep the buffer locked so that the buffer and buf log item
801 * are not reused.
802 */
803void
804xfs_trans_binval(
805 xfs_trans_t *tp,
806 xfs_buf_t *bp)
807{
808 xfs_log_item_desc_t *lidp;
809 xfs_buf_log_item_t *bip;
810
811 ASSERT(XFS_BUF_ISBUSY(bp));
812 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
813 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
814
815 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
816 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
817 ASSERT(lidp != NULL);
818 ASSERT(atomic_read(&bip->bli_refcount) > 0);
819
820 if (bip->bli_flags & XFS_BLI_STALE) {
821 /*
822 * If the buffer is already invalidated, then
823 * just return.
824 */
825 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
826 ASSERT(XFS_BUF_ISSTALE(bp));
827 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
828 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
829 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
830 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
831 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
832 xfs_buftrace("XFS_BINVAL RECUR", bp);
833 xfs_buf_item_trace("BINVAL RECUR", bip);
834 return;
835 }
836
837 /*
838 * Clear the dirty bit in the buffer and set the STALE flag
839 * in the buf log item. The STALE flag will be used in
840 * xfs_buf_item_unpin() to determine if it should clean up
841 * when the last reference to the buf item is given up.
842 * We set the XFS_BLI_CANCEL flag in the buf log format structure
843 * and log the buf item. This will be used at recovery time
844 * to determine that copies of the buffer in the log before
845 * this should not be replayed.
846 * We mark the item descriptor and the transaction dirty so
847 * that we'll hold the buffer until after the commit.
848 *
849 * Since we're invalidating the buffer, we also clear the state
850 * about which parts of the buffer have been logged. We also
851 * clear the flag indicating that this is an inode buffer since
852 * the data in the buffer will no longer be valid.
853 *
854 * We set the stale bit in the buffer as well since we're getting
855 * rid of it.
856 */
857 XFS_BUF_UNDELAYWRITE(bp);
858 XFS_BUF_STALE(bp);
859 bip->bli_flags |= XFS_BLI_STALE;
860 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
861 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
862 bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
863 memset((char *)(bip->bli_format.blf_data_map), 0,
864 (bip->bli_format.blf_map_size * sizeof(uint)));
865 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
866 tp->t_flags |= XFS_TRANS_DIRTY;
867 xfs_buftrace("XFS_BINVAL", bp);
868 xfs_buf_item_trace("BINVAL", bip);
869}
870
871/*
872 * This call is used to indicate that the buffer contains on-disk
873 * inodes which must be handled specially during recovery. They
874 * require special handling because only the di_next_unlinked from
875 * the inodes in the buffer should be recovered. The rest of the
876 * data in the buffer is logged via the inodes themselves.
877 *
878 * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
879 * format structure so that we'll know what to do at recovery time.
880 */
881/* ARGSUSED */
882void
883xfs_trans_inode_buf(
884 xfs_trans_t *tp,
885 xfs_buf_t *bp)
886{
887 xfs_buf_log_item_t *bip;
888
889 ASSERT(XFS_BUF_ISBUSY(bp));
890 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
891 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
892
893 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
894 ASSERT(atomic_read(&bip->bli_refcount) > 0);
895
896 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
897}
898
899/*
900 * This call is used to indicate that the buffer is going to
901 * be staled and was an inode buffer. This means it gets
902 * special processing during unpin - where any inodes
 903 * associated with the buffer should be removed from the AIL.
 904 * There is also special processing during recovery:
905 * any replay of the inodes in the buffer needs to be
906 * prevented as the buffer may have been reused.
907 */
908void
909xfs_trans_stale_inode_buf(
910 xfs_trans_t *tp,
911 xfs_buf_t *bp)
912{
913 xfs_buf_log_item_t *bip;
914
915 ASSERT(XFS_BUF_ISBUSY(bp));
916 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
917 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
918
919 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
920 ASSERT(atomic_read(&bip->bli_refcount) > 0);
921
922 bip->bli_flags |= XFS_BLI_STALE_INODE;
923 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))
924 xfs_buf_iodone;
925}
926
927
928
929/*
930 * Mark the buffer as being one which contains newly allocated
931 * inodes. We need to make sure that even if this buffer is
932 * relogged as an 'inode buf' we still recover all of the inode
933 * images in the face of a crash. This works in coordination with
934 * xfs_buf_item_committed() to ensure that the buffer remains in the
935 * AIL at its original location even after it has been relogged.
936 */
937/* ARGSUSED */
938void
939xfs_trans_inode_alloc_buf(
940 xfs_trans_t *tp,
941 xfs_buf_t *bp)
942{
943 xfs_buf_log_item_t *bip;
944
945 ASSERT(XFS_BUF_ISBUSY(bp));
946 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
947 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
948
949 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
950 ASSERT(atomic_read(&bip->bli_refcount) > 0);
951
952 bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
953}
954
955
956/*
957 * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
958 * dquots. However, unlike in inode buffer recovery, dquot buffers get
959 * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag).
960 * The only thing that makes dquot buffers different from regular
961 * buffers is that we must not replay dquot bufs when recovering
962 * if a _corresponding_ quotaoff has happened. We also have to distinguish
963 * between usr dquot bufs and grp dquot bufs, because usr and grp quotas
964 * can be turned off independently.
965 */
966/* ARGSUSED */
967void
968xfs_trans_dquot_buf(
969 xfs_trans_t *tp,
970 xfs_buf_t *bp,
971 uint type)
972{
973 xfs_buf_log_item_t *bip;
974
975 ASSERT(XFS_BUF_ISBUSY(bp));
976 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
977 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
978 ASSERT(type == XFS_BLI_UDQUOT_BUF ||
979 type == XFS_BLI_GDQUOT_BUF);
980
981 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
982 ASSERT(atomic_read(&bip->bli_refcount) > 0);
983
984 bip->bli_format.blf_flags |= type;
985}
986
987/*
988 * Check to see if a buffer matching the given parameters is already
989 * a part of the given transaction. Only check the first, embedded
990 * chunk, since we don't want to spend all day scanning large transactions.
991 */
992STATIC xfs_buf_t *
993xfs_trans_buf_item_match(
994 xfs_trans_t *tp,
995 xfs_buftarg_t *target,
996 xfs_daddr_t blkno,
997 int len)
998{
999 xfs_log_item_chunk_t *licp;
1000 xfs_log_item_desc_t *lidp;
1001 xfs_buf_log_item_t *blip;
1002 xfs_buf_t *bp;
1003 int i;
1004
1005 bp = NULL;
1006 len = BBTOB(len);
1007 licp = &tp->t_items;
1008 if (!XFS_LIC_ARE_ALL_FREE(licp)) {
1009 for (i = 0; i < licp->lic_unused; i++) {
1010 /*
1011 * Skip unoccupied slots.
1012 */
1013 if (XFS_LIC_ISFREE(licp, i)) {
1014 continue;
1015 }
1016
1017 lidp = XFS_LIC_SLOT(licp, i);
1018 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1019 if (blip->bli_item.li_type != XFS_LI_BUF) {
1020 continue;
1021 }
1022
1023 bp = blip->bli_buf;
1024 if ((XFS_BUF_TARGET(bp) == target) &&
1025 (XFS_BUF_ADDR(bp) == blkno) &&
1026 (XFS_BUF_COUNT(bp) == len)) {
1027 /*
1028 * We found it. Break out and
1029 * return the pointer to the buffer.
1030 */
1031 break;
1032 } else {
1033 bp = NULL;
1034 }
1035 }
1036 }
1037 return bp;
1038}
1039
1040/*
1041 * Check to see if a buffer matching the given parameters is already
1042 * a part of the given transaction. Check all the chunks, we
1043 * want to be thorough.
1044 */
1045STATIC xfs_buf_t *
1046xfs_trans_buf_item_match_all(
1047 xfs_trans_t *tp,
1048 xfs_buftarg_t *target,
1049 xfs_daddr_t blkno,
1050 int len)
1051{
1052 xfs_log_item_chunk_t *licp;
1053 xfs_log_item_desc_t *lidp;
1054 xfs_buf_log_item_t *blip;
1055 xfs_buf_t *bp;
1056 int i;
1057
1058 bp = NULL;
1059 len = BBTOB(len);
1060 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
1061 if (XFS_LIC_ARE_ALL_FREE(licp)) {
1062 ASSERT(licp == &tp->t_items);
1063 ASSERT(licp->lic_next == NULL);
1064 return NULL;
1065 }
1066 for (i = 0; i < licp->lic_unused; i++) {
1067 /*
1068 * Skip unoccupied slots.
1069 */
1070 if (XFS_LIC_ISFREE(licp, i)) {
1071 continue;
1072 }
1073
1074 lidp = XFS_LIC_SLOT(licp, i);
1075 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1076 if (blip->bli_item.li_type != XFS_LI_BUF) {
1077 continue;
1078 }
1079
1080 bp = blip->bli_buf;
1081 if ((XFS_BUF_TARGET(bp) == target) &&
1082 (XFS_BUF_ADDR(bp) == blkno) &&
1083 (XFS_BUF_COUNT(bp) == len)) {
1084 /*
1085 * We found it. Break out and
1086 * return the pointer to the buffer.
1087 */
1088 return bp;
1089 }
1090 }
1091 }
1092 return NULL;
1093}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
new file mode 100644
index 000000000000..93259a15f983
--- /dev/null
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -0,0 +1,156 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h"
43#include "xfs_trans_priv.h"
44#include "xfs_extfree_item.h"
45
46/*
47 * This routine is called to allocate an "extent free intention"
48 * log item that will hold nextents worth of extents. The
49 * caller must use all nextents extents, because we are not
50 * flexible about this at all.
51 */
52xfs_efi_log_item_t *
53xfs_trans_get_efi(xfs_trans_t *tp,
54 uint nextents)
55{
56 xfs_efi_log_item_t *efip;
57
58 ASSERT(tp != NULL);
59 ASSERT(nextents > 0);
60
61 efip = xfs_efi_init(tp->t_mountp, nextents);
62 ASSERT(efip != NULL);
63
64 /*
65 * Get a log_item_desc to point at the new item.
66 */
67 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip);
68
69 return (efip);
70}
71
72/*
73 * This routine is called to indicate that the described
74 * extent is to be logged as needing to be freed. It should
75 * be called once for each extent to be freed.
76 */
77void
78xfs_trans_log_efi_extent(xfs_trans_t *tp,
79 xfs_efi_log_item_t *efip,
80 xfs_fsblock_t start_block,
81 xfs_extlen_t ext_len)
82{
83 xfs_log_item_desc_t *lidp;
84 uint next_extent;
85 xfs_extent_t *extp;
86
87 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
88 ASSERT(lidp != NULL);
89
90 tp->t_flags |= XFS_TRANS_DIRTY;
91 lidp->lid_flags |= XFS_LID_DIRTY;
92
93 next_extent = efip->efi_next_extent;
94 ASSERT(next_extent < efip->efi_format.efi_nextents);
95 extp = &(efip->efi_format.efi_extents[next_extent]);
96 extp->ext_start = start_block;
97 extp->ext_len = ext_len;
98 efip->efi_next_extent++;
99}
100
101
102/*
103 * This routine is called to allocate an "extent free done"
104 * log item that will hold nextents worth of extents. The
105 * caller must use all nextents extents, because we are not
106 * flexible about this at all.
107 */
108xfs_efd_log_item_t *
109xfs_trans_get_efd(xfs_trans_t *tp,
110 xfs_efi_log_item_t *efip,
111 uint nextents)
112{
113 xfs_efd_log_item_t *efdp;
114
115 ASSERT(tp != NULL);
116 ASSERT(nextents > 0);
117
118 efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
119 ASSERT(efdp != NULL);
120
121 /*
122 * Get a log_item_desc to point at the new item.
123 */
124 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp);
125
126 return (efdp);
127}
128
129/*
130 * This routine is called to indicate that the described
131 * extent is to be logged as having been freed. It should
132 * be called once for each extent freed.
133 */
134void
135xfs_trans_log_efd_extent(xfs_trans_t *tp,
136 xfs_efd_log_item_t *efdp,
137 xfs_fsblock_t start_block,
138 xfs_extlen_t ext_len)
139{
140 xfs_log_item_desc_t *lidp;
141 uint next_extent;
142 xfs_extent_t *extp;
143
144 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
145 ASSERT(lidp != NULL);
146
147 tp->t_flags |= XFS_TRANS_DIRTY;
148 lidp->lid_flags |= XFS_LID_DIRTY;
149
150 next_extent = efdp->efd_next_extent;
151 ASSERT(next_extent < efdp->efd_format.efd_nextents);
152 extp = &(efdp->efd_format.efd_extents[next_extent]);
153 extp->ext_start = start_block;
154 extp->ext_len = ext_len;
155 efdp->efd_next_extent++;
156}
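
/*
 * Illustrative sketch, not part of the original file: how the EFI/EFD
 * pair above is meant to be used.  The "intent" item is logged before
 * extents are freed and the matching "done" item after, so that log
 * recovery can redo frees that were intended but never completed.  In
 * practice the two items usually land in separate transactions of a
 * chain; this sketch compresses both steps into one for brevity, and
 * the extent values are hypothetical.
 */
static void
example_free_one_extent(
	xfs_trans_t	*tp,
	xfs_fsblock_t	bno,
	xfs_extlen_t	len)
{
	xfs_efi_log_item_t	*efip;
	xfs_efd_log_item_t	*efdp;

	/* Log the intention to free exactly one extent. */
	efip = xfs_trans_get_efi(tp, 1);
	xfs_trans_log_efi_extent(tp, efip, bno, len);

	/* ... the actual free of (bno, len) would happen here ... */

	/* Log that the free has been done. */
	efdp = xfs_trans_get_efd(tp, efip, 1);
	xfs_trans_log_efd_extent(tp, efdp, bno, len);
}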
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
new file mode 100644
index 000000000000..e2c3706f453d
--- /dev/null
+++ b/fs/xfs/xfs_trans_inode.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_trans_priv.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57
58#ifdef XFS_TRANS_DEBUG
59STATIC void
60xfs_trans_inode_broot_debug(
61 xfs_inode_t *ip);
62#else
63#define xfs_trans_inode_broot_debug(ip)
64#endif
65
66
67/*
68 * Get and lock the inode for the caller if it is not already
69 * locked within the given transaction. If it is already locked
70 * within the transaction, just increment its lock recursion count
71 * and return a pointer to it.
72 *
73 * For an inode to be locked in a transaction, the inode lock, as
74 * opposed to the io lock, must be taken exclusively. This ensures
75 * that the inode can be involved in only 1 transaction at a time.
76 * Lock recursion is handled on the io lock, but only for lock modes
77 * of equal or lesser strength. That is, you can recur on the io lock
78 * held EXCL with a SHARED request but not vice versa. Also, if
79 * the inode is already a part of the transaction then you cannot
80 * go from not holding the io lock to having it EXCL or SHARED.
81 *
82 * Use the inode cache routine xfs_inode_incore() to find the inode
83 * if it is already owned by this transaction.
84 *
85 * If we don't already own the inode, use xfs_iget() to get it.
86 * Since the inode log item structure is embedded in the incore
87 * inode structure and is initialized when the inode is brought
88 * into memory, there is nothing to do with it here.
89 *
90 * If the given transaction pointer is NULL, just call xfs_iget().
91 * This simplifies code which must handle both cases.
92 */
93int
94xfs_trans_iget(
95 xfs_mount_t *mp,
96 xfs_trans_t *tp,
97 xfs_ino_t ino,
98 uint flags,
99 uint lock_flags,
100 xfs_inode_t **ipp)
101{
102 int error;
103 xfs_inode_t *ip;
104 xfs_inode_log_item_t *iip;
105
106 /*
107 * If the transaction pointer is NULL, just call the normal
108 * xfs_iget().
109 */
110 if (tp == NULL)
111 return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
112
113 /*
114 * If we find the inode in core with this transaction
115 * pointer in its i_transp field, then we know we already
116 * have it locked. In this case we just increment the lock
117 * recursion count and return the inode to the caller.
118 * Assert that the inode is already locked in the mode requested
119 * by the caller. We cannot do lock promotions yet, so
120 * die if someone gets this wrong.
121 */
122 if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
123 /*
124 * Make sure that the inode lock is held EXCL and
125 * that the io lock is never upgraded when the inode
126 * is already a part of the transaction.
127 */
128 ASSERT(ip->i_itemp != NULL);
129 ASSERT(lock_flags & XFS_ILOCK_EXCL);
130 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
131 ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
132 ismrlocked(&ip->i_iolock, MR_UPDATE));
133 ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
134 (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
135 ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
136 ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS)));
137 ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
138 (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
139
140 if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
141 ip->i_itemp->ili_iolock_recur++;
142 }
143 if (lock_flags & XFS_ILOCK_EXCL) {
144 ip->i_itemp->ili_ilock_recur++;
145 }
146 *ipp = ip;
147 return 0;
148 }
149
150 ASSERT(lock_flags & XFS_ILOCK_EXCL);
151 error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
152 if (error) {
153 return error;
154 }
155 ASSERT(ip != NULL);
156
157 /*
158 * Get a log_item_desc to point at the new item.
159 */
160 if (ip->i_itemp == NULL)
161 xfs_inode_item_init(ip, mp);
162 iip = ip->i_itemp;
163 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
164
165 xfs_trans_inode_broot_debug(ip);
166
167 /*
168 * If the IO lock has been acquired, mark that in
169 * the inode log item so we'll know to unlock it
170 * when the transaction commits.
171 */
172 ASSERT(iip->ili_flags == 0);
173 if (lock_flags & XFS_IOLOCK_EXCL) {
174 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
175 } else if (lock_flags & XFS_IOLOCK_SHARED) {
176 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
177 }
178
179 /*
180 * Initialize i_transp so we can find it with xfs_inode_incore()
181 * above.
182 */
183 ip->i_transp = tp;
184
185 *ipp = ip;
186 return 0;
187}
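
/*
 * Illustrative sketch, not part of the original file: the recursion
 * behaviour of xfs_trans_iget().  Looking up the same inode a second
 * time within one transaction does not re-lock it; it returns the
 * same incore inode and bumps ili_ilock_recur, as asserted above.
 * The inode number is a hypothetical placeholder.
 */
static int
example_iget_twice(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino)
{
	xfs_inode_t	*ip1;
	xfs_inode_t	*ip2;
	int		error;

	error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip1);
	if (error)
		return error;

	/* Same inode, same transaction: no deadlock, same pointer back. */
	error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip2);
	if (!error)
		ASSERT(ip2 == ip1);
	return error;
}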
188
189/*
190 * Add the locked inode to the transaction.
191 * The inode must be locked, and it cannot be associated with any
192 * transaction. The caller must specify the locks already held
193 * on the inode.
194 */
195void
196xfs_trans_ijoin(
197 xfs_trans_t *tp,
198 xfs_inode_t *ip,
199 uint lock_flags)
200{
201 xfs_inode_log_item_t *iip;
202
203 ASSERT(ip->i_transp == NULL);
204 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
205 ASSERT(lock_flags & XFS_ILOCK_EXCL);
206 if (ip->i_itemp == NULL)
207 xfs_inode_item_init(ip, ip->i_mount);
208 iip = ip->i_itemp;
209 ASSERT(iip->ili_flags == 0);
210 ASSERT(iip->ili_ilock_recur == 0);
211 ASSERT(iip->ili_iolock_recur == 0);
212
213 /*
214 * Get a log_item_desc to point at the new item.
215 */
216 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip));
217
218 xfs_trans_inode_broot_debug(ip);
219
220 /*
221 * If the IO lock is already held, mark that in the inode log item.
222 */
223 if (lock_flags & XFS_IOLOCK_EXCL) {
224 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
225 } else if (lock_flags & XFS_IOLOCK_SHARED) {
226 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
227 }
228
229 /*
230 * Initialize i_transp so we can find it with xfs_inode_incore()
231 * in xfs_trans_iget() above.
232 */
233 ip->i_transp = tp;
234}
235
236
237
238/*
239 * Mark the inode as not needing to be unlocked when the inode item's
240 * IOP_UNLOCK() routine is called. The inode must already be locked
241 * and associated with the given transaction.
242 */
243/*ARGSUSED*/
244void
245xfs_trans_ihold(
246 xfs_trans_t *tp,
247 xfs_inode_t *ip)
248{
249 ASSERT(ip->i_transp == tp);
250 ASSERT(ip->i_itemp != NULL);
251 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
252
253 ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
254}
255
256/*
257 * Cancel the previous inode hold request made on this inode
258 * for this transaction.
259 */
260/*ARGSUSED*/
261void
262xfs_trans_ihold_release(
263 xfs_trans_t *tp,
264 xfs_inode_t *ip)
265{
266 ASSERT(ip->i_transp == tp);
267 ASSERT(ip->i_itemp != NULL);
268 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
269 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
270
271 ip->i_itemp->ili_flags &= ~XFS_ILI_HOLD;
272}
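
/*
 * Illustrative sketch, not part of the original file: the usual
 * ijoin/ihold pairing.  Joining hands the locked inode to the
 * transaction, which would otherwise unlock it at commit time;
 * holding keeps the lock with the caller across the commit.
 * xfs_truncate_file() later in this patch uses this exact pattern.
 */
static void
example_join_and_hold(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ilock(ip, XFS_ILOCK_EXCL);		 /* caller takes the lock */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* transaction tracks it */
	xfs_trans_ihold(tp, ip);		 /* commit won't unlock it */
}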
273
274
275/*
276 * This is called to mark the fields indicated in fieldmask as needing
277 * to be logged when the transaction is committed. The inode must
278 * already be associated with the given transaction.
279 *
280 * The values for fieldmask are defined in xfs_inode_item.h. We always
281 * log all of the core inode if any of it has changed, and we always log
282 * all of the inline data/extents/b-tree root if any of them has changed.
283 */
284void
285xfs_trans_log_inode(
286 xfs_trans_t *tp,
287 xfs_inode_t *ip,
288 uint flags)
289{
290 xfs_log_item_desc_t *lidp;
291
292 ASSERT(ip->i_transp == tp);
293 ASSERT(ip->i_itemp != NULL);
294 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
295
296 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
297 ASSERT(lidp != NULL);
298
299 tp->t_flags |= XFS_TRANS_DIRTY;
300 lidp->lid_flags |= XFS_LID_DIRTY;
301
302 /*
303 * Always OR in the bits from the ili_last_fields field.
304 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
305 * routines in the eventual clearing of the ilf_fields bits.
306 * See the big comment in xfs_iflush() for an explanation of
	 307	 * this coordination mechanism.
308 */
309 flags |= ip->i_itemp->ili_last_fields;
310 ip->i_itemp->ili_format.ilf_fields |= flags;
311}
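
/*
 * Illustrative sketch, not part of the original file: the common
 * logging idiom for core inode fields.  Any change to the dinode
 * core is followed by xfs_trans_log_inode() with XFS_ILOG_CORE;
 * xfs_droplink() later in this patch does exactly this.
 */
static void
example_touch_inode(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);	/* dirty core timestamps */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}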
312
313#ifdef XFS_TRANS_DEBUG
314/*
315 * Keep track of the state of the inode btree root to make sure we
316 * log it properly.
317 */
318STATIC void
319xfs_trans_inode_broot_debug(
320 xfs_inode_t *ip)
321{
322 xfs_inode_log_item_t *iip;
323
324 ASSERT(ip->i_itemp != NULL);
325 iip = ip->i_itemp;
326 if (iip->ili_root_size != 0) {
327 ASSERT(iip->ili_orig_root != NULL);
328 kmem_free(iip->ili_orig_root, iip->ili_root_size);
329 iip->ili_root_size = 0;
330 iip->ili_orig_root = NULL;
331 }
332 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
333 ASSERT((ip->i_df.if_broot != NULL) &&
334 (ip->i_df.if_broot_bytes > 0));
335 iip->ili_root_size = ip->i_df.if_broot_bytes;
336 iip->ili_orig_root =
337 (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
338 memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot),
339 iip->ili_root_size);
340 }
341}
342#endif
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
new file mode 100644
index 000000000000..1b8a756d80ed
--- /dev/null
+++ b/fs/xfs/xfs_trans_item.c
@@ -0,0 +1,553 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39
40STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
41 int, int, xfs_lsn_t);
42
43/*
44 * This is called to add the given log item to the transaction's
45 * list of log items. It must find a free log item descriptor
46 * or allocate a new one and add the item to that descriptor.
	  47	 * The function returns a pointer to the item descriptor used to point
48 * to the new item. The log item will now point to its new descriptor
49 * with its li_desc field.
50 */
51xfs_log_item_desc_t *
52xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
53{
54 xfs_log_item_desc_t *lidp;
55 xfs_log_item_chunk_t *licp;
	  56		int			i = 0;
57
58 /*
59 * If there are no free descriptors, allocate a new chunk
60 * of them and put it at the front of the chunk list.
61 */
62 if (tp->t_items_free == 0) {
63 licp = (xfs_log_item_chunk_t*)
64 kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
65 ASSERT(licp != NULL);
66 /*
67 * Initialize the chunk, and then
68 * claim the first slot in the newly allocated chunk.
69 */
70 XFS_LIC_INIT(licp);
71 XFS_LIC_CLAIM(licp, 0);
72 licp->lic_unused = 1;
73 XFS_LIC_INIT_SLOT(licp, 0);
74 lidp = XFS_LIC_SLOT(licp, 0);
75
76 /*
77 * Link in the new chunk and update the free count.
78 */
79 licp->lic_next = tp->t_items.lic_next;
80 tp->t_items.lic_next = licp;
81 tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
82
83 /*
84 * Initialize the descriptor and the generic portion
85 * of the log item.
86 *
87 * Point the new slot at this item and return it.
88 * Also point the log item at its currently active
89 * descriptor and set the item's mount pointer.
90 */
91 lidp->lid_item = lip;
92 lidp->lid_flags = 0;
93 lidp->lid_size = 0;
94 lip->li_desc = lidp;
95 lip->li_mountp = tp->t_mountp;
96 return (lidp);
97 }
98
99 /*
100 * Find the free descriptor. It is somewhere in the chunklist
101 * of descriptors.
102 */
103 licp = &tp->t_items;
104 while (licp != NULL) {
105 if (XFS_LIC_VACANCY(licp)) {
106 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
107 i = licp->lic_unused;
108 ASSERT(XFS_LIC_ISFREE(licp, i));
109 break;
110 }
111 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
112 if (XFS_LIC_ISFREE(licp, i))
113 break;
114 }
115 ASSERT(i <= XFS_LIC_MAX_SLOT);
116 break;
117 }
118 licp = licp->lic_next;
119 }
120 ASSERT(licp != NULL);
121 /*
122 * If we find a free descriptor, claim it,
123 * initialize it, and return it.
124 */
125 XFS_LIC_CLAIM(licp, i);
126 if (licp->lic_unused <= i) {
127 licp->lic_unused = i + 1;
128 XFS_LIC_INIT_SLOT(licp, i);
129 }
130 lidp = XFS_LIC_SLOT(licp, i);
131 tp->t_items_free--;
132 lidp->lid_item = lip;
133 lidp->lid_flags = 0;
134 lidp->lid_size = 0;
135 lip->li_desc = lidp;
136 lip->li_mountp = tp->t_mountp;
137 return (lidp);
138}
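
/*
 * Illustrative, self-contained miniature, not part of the original
 * file, of the slot bookkeeping used above.  The real XFS_LIC_*
 * macros live in xfs_trans.h; this standalone userspace sketch only
 * assumes they behave like a per-chunk free bitmask plus the
 * "lic_unused" high-water mark visible in the code.
 */
#include <assert.h>

#define MINI_NSLOTS	15	/* stand-in for XFS_LIC_NUM_SLOTS */

struct mini_chunk {
	unsigned int	free_mask;	/* bit set => slot is free */
	int		unused;		/* first never-used slot */
};

static void mini_init(struct mini_chunk *c)
{
	c->free_mask = (1u << MINI_NSLOTS) - 1;	/* all slots free */
	c->unused = 0;
}

static int mini_claim(struct mini_chunk *c)	/* returns slot or -1 */
{
	int i;

	for (i = 0; i < MINI_NSLOTS; i++) {
		if (c->free_mask & (1u << i)) {
			c->free_mask &= ~(1u << i);
			if (c->unused <= i)	/* advance high-water mark */
				c->unused = i + 1;
			return i;
		}
	}
	return -1;			/* chunk full: allocate a new one */
}

static void mini_release(struct mini_chunk *c, int slot)
{
	c->free_mask |= 1u << slot;	/* mark the slot free again */
}

int main(void)
{
	struct mini_chunk c;
	int a, b;

	mini_init(&c);
	a = mini_claim(&c);		/* slot 0 */
	b = mini_claim(&c);		/* slot 1 */
	mini_release(&c, a);
	assert(mini_claim(&c) == a);	/* a freed slot is reused first */
	assert(c.unused == b + 1);	/* high-water mark is unchanged */
	return 0;
}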
139
140/*
141 * Free the given descriptor.
142 *
143 * This requires setting the bit in the chunk's free mask corresponding
144 * to the given slot.
145 */
146void
147xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
148{
149 uint slot;
150 xfs_log_item_chunk_t *licp;
151 xfs_log_item_chunk_t **licpp;
152
153 slot = XFS_LIC_DESC_TO_SLOT(lidp);
154 licp = XFS_LIC_DESC_TO_CHUNK(lidp);
155 XFS_LIC_RELSE(licp, slot);
156 lidp->lid_item->li_desc = NULL;
157 tp->t_items_free++;
158
159 /*
160 * If there are no more used items in the chunk and this is not
161 * the chunk embedded in the transaction structure, then free
162 * the chunk. First pull it from the chunk list and then
163 * free it back to the heap. We didn't bother with a doubly
164 * linked list here because the lists should be very short
165 * and this is not a performance path. It's better to save
166 * the memory of the extra pointer.
167 *
168 * Also decrement the transaction structure's count of free items
169 * by the number in a chunk since we are freeing an empty chunk.
170 */
171 if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) {
172 licpp = &(tp->t_items.lic_next);
173 while (*licpp != licp) {
174 ASSERT(*licpp != NULL);
175 licpp = &((*licpp)->lic_next);
176 }
177 *licpp = licp->lic_next;
178 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
179 tp->t_items_free -= XFS_LIC_NUM_SLOTS;
180 }
181}
182
183/*
184 * This is called to find the descriptor corresponding to the given
185 * log item. It returns a pointer to the descriptor.
186 * The log item MUST have a corresponding descriptor in the given
	 187	 * transaction. This routine does not return NULL; it panics.
188 *
189 * The descriptor pointer is kept in the log item's li_desc field.
190 * Just return it.
191 */
192/*ARGSUSED*/
193xfs_log_item_desc_t *
194xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
195{
196 ASSERT(lip->li_desc != NULL);
197
198 return (lip->li_desc);
199}
200
201
202/*
203 * Return a pointer to the first descriptor in the chunk list.
	 204	 * An empty list is a bug: this asserts in debug builds, else warns and returns NULL.
205 *
206 * The first descriptor must be in either the first or second chunk.
207 * This is because the only chunk allowed to be empty is the first.
208 * All others are freed when they become empty.
209 *
210 * At some point this and xfs_trans_next_item() should be optimized
211 * to quickly look at the mask to determine if there is anything to
212 * look at.
213 */
214xfs_log_item_desc_t *
215xfs_trans_first_item(xfs_trans_t *tp)
216{
217 xfs_log_item_chunk_t *licp;
218 int i;
219
220 licp = &tp->t_items;
221 /*
222 * If it's not in the first chunk, skip to the second.
223 */
224 if (XFS_LIC_ARE_ALL_FREE(licp)) {
225 licp = licp->lic_next;
226 }
227
228 /*
229 * Return the first non-free descriptor in the chunk.
230 */
231 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
232 for (i = 0; i < licp->lic_unused; i++) {
233 if (XFS_LIC_ISFREE(licp, i)) {
234 continue;
235 }
236
237 return (XFS_LIC_SLOT(licp, i));
238 }
239 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
240 return(NULL);
241}
242
243
244/*
245 * Given a descriptor, return the next descriptor in the chunk list.
246 * This returns NULL if there are no more used descriptors in the list.
247 *
248 * We do this by first locating the chunk in which the descriptor resides,
249 * and then scanning forward in the chunk and the list for the next
250 * used descriptor.
251 */
252/*ARGSUSED*/
253xfs_log_item_desc_t *
254xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
255{
256 xfs_log_item_chunk_t *licp;
257 int i;
258
259 licp = XFS_LIC_DESC_TO_CHUNK(lidp);
260
261 /*
262 * First search the rest of the chunk. The for loop keeps us
263 * from referencing things beyond the end of the chunk.
264 */
265 for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) {
266 if (XFS_LIC_ISFREE(licp, i)) {
267 continue;
268 }
269
270 return (XFS_LIC_SLOT(licp, i));
271 }
272
273 /*
274 * Now search the next chunk. It must be there, because the
275 * next chunk would have been freed if it were empty.
276 * If there is no next chunk, return NULL.
277 */
278 if (licp->lic_next == NULL) {
279 return (NULL);
280 }
281
282 licp = licp->lic_next;
283 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
284 for (i = 0; i < licp->lic_unused; i++) {
285 if (XFS_LIC_ISFREE(licp, i)) {
286 continue;
287 }
288
289 return (XFS_LIC_SLOT(licp, i));
290 }
291 ASSERT(0);
292 /* NOTREACHED */
	 293	return NULL;	/* keep gcc quiet */
294}
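
/*
 * Illustrative sketch, not part of the original file: the iteration
 * idiom these two routines support.  Callers only walk transactions
 * known to hold at least one item, since xfs_trans_first_item()
 * complains on an empty list.
 */
static void
example_walk_items(xfs_trans_t *tp)
{
	xfs_log_item_desc_t	*lidp;

	for (lidp = xfs_trans_first_item(tp);
	     lidp != NULL;
	     lidp = xfs_trans_next_item(tp, lidp)) {
		/* operate on lidp->lid_item here */
	}
}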
295
296/*
297 * This is called to unlock all of the items of a transaction and to free
298 * all the descriptors of that transaction.
299 *
300 * It walks the list of descriptors and unlocks each item. It frees
301 * each chunk except that embedded in the transaction as it goes along.
302 */
303void
304xfs_trans_free_items(
305 xfs_trans_t *tp,
306 int flags)
307{
308 xfs_log_item_chunk_t *licp;
309 xfs_log_item_chunk_t *next_licp;
310 int abort;
311
312 abort = flags & XFS_TRANS_ABORT;
313 licp = &tp->t_items;
314 /*
315 * Special case the embedded chunk so we don't free it below.
316 */
317 if (!XFS_LIC_ARE_ALL_FREE(licp)) {
318 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
319 XFS_LIC_ALL_FREE(licp);
320 licp->lic_unused = 0;
321 }
322 licp = licp->lic_next;
323
324 /*
325 * Unlock each item in each chunk and free the chunks.
326 */
327 while (licp != NULL) {
328 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
329 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
330 next_licp = licp->lic_next;
331 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
332 licp = next_licp;
333 }
334
335 /*
336 * Reset the transaction structure's free item count.
337 */
338 tp->t_items_free = XFS_LIC_NUM_SLOTS;
339 tp->t_items.lic_next = NULL;
340}
341
342
343
344/*
345 * This is called to unlock the items associated with a transaction.
346 * Items which were not logged should be freed.
347 * Those which were logged must still be tracked so they can be unpinned
348 * when the transaction commits.
349 */
350void
351xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
352{
353 xfs_log_item_chunk_t *licp;
354 xfs_log_item_chunk_t *next_licp;
355 xfs_log_item_chunk_t **licpp;
356 int freed;
357
358 freed = 0;
359 licp = &tp->t_items;
360
361 /*
	 362	 * Special case the embedded chunk so we don't free it below.
363 */
364 if (!XFS_LIC_ARE_ALL_FREE(licp)) {
365 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
366 }
367 licpp = &(tp->t_items.lic_next);
368 licp = licp->lic_next;
369
370 /*
371 * Unlock each item in each chunk, free non-dirty descriptors,
372 * and free empty chunks.
373 */
374 while (licp != NULL) {
375 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
376 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
377 next_licp = licp->lic_next;
378 if (XFS_LIC_ARE_ALL_FREE(licp)) {
379 *licpp = next_licp;
380 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
381 freed -= XFS_LIC_NUM_SLOTS;
382 } else {
383 licpp = &(licp->lic_next);
384 }
385 ASSERT(*licpp == next_licp);
386 licp = next_licp;
387 }
388
389 /*
390 * Fix the free descriptor count in the transaction.
391 */
392 tp->t_items_free += freed;
393}
394
395/*
396 * Unlock each item pointed to by a descriptor in the given chunk.
397 * Stamp the commit lsn into each item if necessary.
398 * Free descriptors pointing to items which are not dirty if freeing_chunk
399 * is zero. If freeing_chunk is non-zero, then we need to unlock all
400 * items in the chunk.
401 *
402 * Return the number of descriptors freed.
403 */
404STATIC int
405xfs_trans_unlock_chunk(
406 xfs_log_item_chunk_t *licp,
407 int freeing_chunk,
408 int abort,
409 xfs_lsn_t commit_lsn)
410{
411 xfs_log_item_desc_t *lidp;
412 xfs_log_item_t *lip;
413 int i;
414 int freed;
415
416 freed = 0;
417 lidp = licp->lic_descs;
418 for (i = 0; i < licp->lic_unused; i++, lidp++) {
419 if (XFS_LIC_ISFREE(licp, i)) {
420 continue;
421 }
422 lip = lidp->lid_item;
423 lip->li_desc = NULL;
424
425 if (commit_lsn != NULLCOMMITLSN)
426 IOP_COMMITTING(lip, commit_lsn);
427 if (abort)
428 lip->li_flags |= XFS_LI_ABORTED;
429 IOP_UNLOCK(lip);
430
431 /*
432 * Free the descriptor if the item is not dirty
433 * within this transaction and the caller is not
434 * going to just free the entire thing regardless.
435 */
436 if (!(freeing_chunk) &&
437 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
438 XFS_LIC_RELSE(licp, i);
439 freed++;
440 }
441 }
442
443 return (freed);
444}
445
446
447/*
448 * This is called to add the given busy item to the transaction's
449 * list of busy items. It must find a free busy item descriptor
450 * or allocate a new one and add the item to that descriptor.
	 451	 * list of busy items. The function returns a pointer to the busy
	 452	 * slot used to record the new busy entry, claiming a free slot in
	 453	 * an existing chunk or allocating a new chunk when none is free.
454 */
455xfs_log_busy_slot_t *
456xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
457{
458 xfs_log_busy_chunk_t *lbcp;
459 xfs_log_busy_slot_t *lbsp;
	 460		int			i = 0;
461
462 /*
463 * If there are no free descriptors, allocate a new chunk
464 * of them and put it at the front of the chunk list.
465 */
466 if (tp->t_busy_free == 0) {
467 lbcp = (xfs_log_busy_chunk_t*)
468 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
469 ASSERT(lbcp != NULL);
470 /*
471 * Initialize the chunk, and then
472 * claim the first slot in the newly allocated chunk.
473 */
474 XFS_LBC_INIT(lbcp);
475 XFS_LBC_CLAIM(lbcp, 0);
476 lbcp->lbc_unused = 1;
477 lbsp = XFS_LBC_SLOT(lbcp, 0);
478
479 /*
480 * Link in the new chunk and update the free count.
481 */
482 lbcp->lbc_next = tp->t_busy.lbc_next;
483 tp->t_busy.lbc_next = lbcp;
	 484			tp->t_busy_free = XFS_LBC_NUM_SLOTS - 1;
485
486 /*
487 * Initialize the descriptor and the generic portion
488 * of the log item.
489 *
490 * Point the new slot at this item and return it.
491 * Also point the log item at its currently active
492 * descriptor and set the item's mount pointer.
493 */
494 lbsp->lbc_ag = ag;
495 lbsp->lbc_idx = idx;
496 return (lbsp);
497 }
498
499 /*
500 * Find the free descriptor. It is somewhere in the chunklist
501 * of descriptors.
502 */
503 lbcp = &tp->t_busy;
504 while (lbcp != NULL) {
505 if (XFS_LBC_VACANCY(lbcp)) {
506 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
507 i = lbcp->lbc_unused;
508 break;
509 } else {
510 /* out-of-order vacancy */
511 printk("OOO vacancy lbcp 0x%p\n", lbcp);
512 ASSERT(0);
513 }
514 }
515 lbcp = lbcp->lbc_next;
516 }
517 ASSERT(lbcp != NULL);
518 /*
519 * If we find a free descriptor, claim it,
520 * initialize it, and return it.
521 */
522 XFS_LBC_CLAIM(lbcp, i);
523 if (lbcp->lbc_unused <= i) {
524 lbcp->lbc_unused = i + 1;
525 }
526 lbsp = XFS_LBC_SLOT(lbcp, i);
527 tp->t_busy_free--;
528 lbsp->lbc_ag = ag;
529 lbsp->lbc_idx = idx;
530 return (lbsp);
531}
532
533
534/*
535 * xfs_trans_free_busy
536 * Free all of the busy lists from a transaction
537 */
538void
539xfs_trans_free_busy(xfs_trans_t *tp)
540{
541 xfs_log_busy_chunk_t *lbcp;
542 xfs_log_busy_chunk_t *lbcq;
543
544 lbcp = tp->t_busy.lbc_next;
545 while (lbcp != NULL) {
546 lbcq = lbcp->lbc_next;
547 kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t));
548 lbcp = lbcq;
549 }
550
551 XFS_LBC_INIT(&tp->t_busy);
552 tp->t_busy.lbc_unused = 0;
553}
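
/*
 * Illustrative sketch, not part of the original file: how the busy
 * list above is meant to be used.  When an extent is freed, its AG
 * number and index are recorded so the space is not handed out again
 * before the freeing transaction is safely on disk; the whole list
 * is torn down once the transaction completes.  The values here are
 * hypothetical.
 */
static void
example_mark_busy(
	xfs_trans_t	*tp,
	xfs_agnumber_t	agno,
	xfs_extlen_t	idx)
{
	(void) xfs_trans_add_busy(tp, agno, idx);

	/* ... and after commit processing: xfs_trans_free_busy(tp); */
}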
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
new file mode 100644
index 000000000000..d4dae7d06afc
--- /dev/null
+++ b/fs/xfs/xfs_trans_priv.h
@@ -0,0 +1,73 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TRANS_PRIV_H__
33#define __XFS_TRANS_PRIV_H__
34
35struct xfs_log_item;
36struct xfs_log_item_desc;
37struct xfs_mount;
38struct xfs_trans;
39
40/*
41 * From xfs_trans_item.c
42 */
43struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *,
44 struct xfs_log_item *);
45void xfs_trans_free_item(struct xfs_trans *,
46 struct xfs_log_item_desc *);
47struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
48 struct xfs_log_item *);
49struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
50struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
51 struct xfs_log_item_desc *);
52void xfs_trans_free_items(struct xfs_trans *, int);
53void xfs_trans_unlock_items(struct xfs_trans *,
54 xfs_lsn_t);
55void xfs_trans_free_busy(xfs_trans_t *tp);
56xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
57 xfs_agnumber_t ag,
58 xfs_extlen_t idx);
59
60/*
61 * From xfs_trans_ail.c
62 */
63void xfs_trans_update_ail(struct xfs_mount *,
64 struct xfs_log_item *, xfs_lsn_t,
65 unsigned long);
66void xfs_trans_delete_ail(struct xfs_mount *,
67 struct xfs_log_item *, unsigned long);
68struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
69struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
70 struct xfs_log_item *, int *, int *);
71
72
73#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
new file mode 100644
index 000000000000..e91d173f4ed3
--- /dev/null
+++ b/fs/xfs/xfs_trans_space.h
@@ -0,0 +1,105 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TRANS_SPACE_H__
33#define __XFS_TRANS_SPACE_H__
34
35/*
36 * Components of space reservations.
37 */
38#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
39 (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
40#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
41#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
42 (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
43 XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
44 XFS_EXTENTADD_SPACE_RES(mp,w))
45#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
46#define XFS_DAENTER_DBS(mp,w) \
47 (XFS_DA_NODE_MAXDEPTH + \
48 ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0))
49#define XFS_DAENTER_BLOCKS(mp,w) \
50 (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
51#define XFS_DAENTER_BMAP1B(mp,w) \
52 XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
53#define XFS_DAENTER_BMAPS(mp,w) \
54 (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
55#define XFS_DAENTER_SPACE_RES(mp,w) \
56 (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
57#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w)
58#define XFS_DIRENTER_MAX_SPLIT(mp,nl) \
59 (((mp)->m_sb.sb_blocksize == 512 && \
60 XFS_DIR_IS_V1(mp) && \
61 (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1)
62#define XFS_DIRENTER_SPACE_RES(mp,nl) \
63 (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
64 XFS_DIRENTER_MAX_SPLIT(mp,nl))
65#define XFS_DIRREMOVE_SPACE_RES(mp) \
66 XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
67#define XFS_IALLOC_SPACE_RES(mp) \
68 (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
69
70/*
71 * Space reservation values for various transactions.
72 */
73#define XFS_ADDAFORK_SPACE_RES(mp) \
74 ((mp)->m_dirblkfsbs + \
75 (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)))
76#define XFS_ATTRRM_SPACE_RES(mp) \
77 XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
78/* This macro is not used - see inline code in xfs_attr_set */
79#define XFS_ATTRSET_SPACE_RES(mp, v) \
80 (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
81#define XFS_CREATE_SPACE_RES(mp,nl) \
82 (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
83#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
84 (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
85#define XFS_GROWFS_SPACE_RES(mp) \
86 (2 * XFS_AG_MAXLEVELS(mp))
87#define XFS_GROWFSRT_SPACE_RES(mp,b) \
88 ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
89#define XFS_LINK_SPACE_RES(mp,nl) \
90 XFS_DIRENTER_SPACE_RES(mp,nl)
91#define XFS_MKDIR_SPACE_RES(mp,nl) \
92 (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
93#define XFS_QM_DQALLOC_SPACE_RES(mp) \
94 (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
95 XFS_DQUOT_CLUSTER_SIZE_FSB)
96#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
97 XFS_IALLOC_SPACE_RES(mp)
98#define XFS_REMOVE_SPACE_RES(mp) \
99 XFS_DIRREMOVE_SPACE_RES(mp)
100#define XFS_RENAME_SPACE_RES(mp,nl) \
101 (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
102#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
103 (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
104
105#endif /* __XFS_TRANS_SPACE_H__ */
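
/*
 * Illustrative sketch, not part of the original header: how these
 * macros are meant to be consumed.  The worst-case block reservation
 * is computed up front and passed to xfs_trans_reserve() before the
 * transaction dirties anything.  The zero log reservation arguments
 * are hypothetical placeholders; real callers pass XFS_*_LOG_RES()
 * values, as xfs_truncate_file() in this patch does.
 */
static int
example_reserve_for_create(
	xfs_trans_t	*tp,
	xfs_mount_t	*mp,
	int		namelen)
{
	uint	blocks = XFS_CREATE_SPACE_RES(mp, namelen);

	return xfs_trans_reserve(tp, blocks, 0, 0, 0, 0);
}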
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
new file mode 100644
index 000000000000..04609d27ea51
--- /dev/null
+++ b/fs/xfs/xfs_types.h
@@ -0,0 +1,182 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TYPES_H__
33#define __XFS_TYPES_H__
34
35#ifdef __KERNEL__
36
37/*
38 * POSIX Extensions
39 */
40typedef unsigned char uchar_t;
41typedef unsigned short ushort_t;
42typedef unsigned int uint_t;
43typedef unsigned long ulong_t;
44
45/*
46 * Additional type declarations for XFS
47 */
48typedef signed char __int8_t;
49typedef unsigned char __uint8_t;
50typedef signed short int __int16_t;
51typedef unsigned short int __uint16_t;
52typedef signed int __int32_t;
53typedef unsigned int __uint32_t;
54typedef signed long long int __int64_t;
55typedef unsigned long long int __uint64_t;
56
	  57typedef enum { B_FALSE, B_TRUE } boolean_t;
58typedef __int64_t prid_t; /* project ID */
59typedef __uint32_t inst_t; /* an instruction */
60
61typedef __s64 xfs_off_t; /* <file offset> type */
62typedef __u64 xfs_ino_t; /* <inode> type */
63typedef __s64 xfs_daddr_t; /* <disk address> type */
64typedef char * xfs_caddr_t; /* <core address> type */
65typedef __u32 xfs_dev_t;
66
67/* __psint_t is the same size as a pointer */
68#if (BITS_PER_LONG == 32)
69typedef __int32_t __psint_t;
70typedef __uint32_t __psunsigned_t;
71#elif (BITS_PER_LONG == 64)
72typedef __int64_t __psint_t;
73typedef __uint64_t __psunsigned_t;
74#else
75#error BITS_PER_LONG must be 32 or 64
76#endif
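
/*
 * Illustrative sketch, not part of the original header: the point of
 * __psint_t/__psunsigned_t is that a pointer value round-trips
 * through them without truncation on both 32-bit and 64-bit builds.
 */
static inline void *
example_psint_roundtrip(void *p)
{
	__psint_t	v = (__psint_t)p;	/* same width as a pointer */

	return (void *)v;			/* value is preserved */
}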
77
78#endif /* __KERNEL__ */
79
80typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
81typedef __uint32_t xfs_extlen_t; /* extent length in blocks */
82typedef __uint32_t xfs_agnumber_t; /* allocation group number */
83typedef __int32_t xfs_extnum_t; /* # of extents in a file */
84typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */
85typedef __int64_t xfs_fsize_t; /* bytes in a file */
86typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
87
88typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */
89typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */
90
91typedef __int64_t xfs_lsn_t; /* log sequence number */
92typedef __int32_t xfs_tid_t; /* transaction identifier */
93
94typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
95typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
96
97typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
98
99/*
100 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
101 * Disk based types:
102 */
103typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */
104typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */
105typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */
106typedef __uint64_t xfs_dfiloff_t; /* block number in a file */
107typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */
108
109/*
110 * Memory based types are conditional.
111 */
112#if XFS_BIG_BLKNOS
113typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
114typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
115typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
116typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
117#else
118typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
119typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
120typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */
121typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
122#endif
123typedef __uint64_t xfs_fileoff_t; /* block number in a file */
124typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
125typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
126
127typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */
128
129/*
130 * Null values for the types.
131 */
132#define NULLDFSBNO ((xfs_dfsbno_t)-1)
133#define NULLDRFSBNO ((xfs_drfsbno_t)-1)
134#define NULLDRTBNO ((xfs_drtbno_t)-1)
135#define NULLDFILOFF ((xfs_dfiloff_t)-1)
136
137#define NULLFSBLOCK ((xfs_fsblock_t)-1)
138#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
139#define NULLRTBLOCK ((xfs_rtblock_t)-1)
140#define NULLFILEOFF ((xfs_fileoff_t)-1)
141
142#define NULLAGBLOCK ((xfs_agblock_t)-1)
143#define NULLAGNUMBER ((xfs_agnumber_t)-1)
144#define NULLEXTNUM ((xfs_extnum_t)-1)
145
146#define NULLCOMMITLSN ((xfs_lsn_t)-1)
147
148/*
149 * Max values for extlen, extnum, aextnum.
150 */
151#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
152#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
153#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
154
155/*
156 * MAXNAMELEN is the length (including the terminating null) of
157 * the longest permissible file (component) name.
158 */
159#define MAXNAMELEN 256
160
161typedef struct xfs_dirent { /* data from readdir() */
162 xfs_ino_t d_ino; /* inode number of entry */
163 xfs_off_t d_off; /* offset of disk directory entry */
164 unsigned short d_reclen; /* length of this record */
165 char d_name[1]; /* name of file */
166} xfs_dirent_t;
167
168#define DIRENTBASESIZE (((xfs_dirent_t *)0)->d_name - (char *)0)
169#define DIRENTSIZE(namelen) \
170 ((DIRENTBASESIZE + (namelen) + \
171 sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1))
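
/*
 * Worked example, not part of the original header, assuming the
 * usual layout with 8-byte xfs_ino_t/xfs_off_t and a 2-byte
 * d_reclen, so DIRENTBASESIZE is 18:
 *
 *	DIRENTSIZE(5) = (18 + 5 + 8) & ~7 = 31 & ~7 = 24
 *
 * Adding sizeof(xfs_off_t) before masking both covers the name's
 * NUL terminator and rounds the record up to 8-byte alignment.
 */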
172
173typedef enum {
174 XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
175} xfs_lookup_t;
176
177typedef enum {
178 XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
179 XFS_BTNUM_MAX
180} xfs_btnum_t;
181
182#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
new file mode 100644
index 000000000000..816b945fa0ea
--- /dev/null
+++ b/fs/xfs/xfs_utils.c
@@ -0,0 +1,488 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_bmap_btree.h"
45#include "xfs_attr_sf.h"
46#include "xfs_dir_sf.h"
47#include "xfs_dir2_sf.h"
48#include "xfs_dinode.h"
49#include "xfs_inode_item.h"
50#include "xfs_inode.h"
51#include "xfs_bmap.h"
52#include "xfs_error.h"
53#include "xfs_quota.h"
54#include "xfs_rw.h"
55#include "xfs_itable.h"
56#include "xfs_utils.h"
57
58/*
	  59 * xfs_get_dir_entry is used to get a reference to an inode given
	  60 * the name of the file.  It takes a hold on the vnode behind the
	  61 * name and returns the corresponding incore inode.  The child
	  62 * inode is not locked; that is left to a later call to
	  63 * xfs_lock_dir_and_entry.
64 */
65int
66xfs_get_dir_entry(
67 vname_t *dentry,
68 xfs_inode_t **ipp)
69{
70 vnode_t *vp;
71 bhv_desc_t *bdp;
72
73 vp = VNAME_TO_VNODE(dentry);
74 bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
75 if (!bdp) {
76 *ipp = NULL;
77 return XFS_ERROR(ENOENT);
78 }
79 VN_HOLD(vp);
80 *ipp = XFS_BHVTOI(bdp);
81 return 0;
82}
83
84int
85xfs_dir_lookup_int(
86 bhv_desc_t *dir_bdp,
87 uint lock_mode,
88 vname_t *dentry,
89 xfs_ino_t *inum,
90 xfs_inode_t **ipp)
91{
92 vnode_t *dir_vp;
93 xfs_inode_t *dp;
94 int error;
95
96 dir_vp = BHV_TO_VNODE(dir_bdp);
97 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
98
99 dp = XFS_BHVTOI(dir_bdp);
100
101 error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp,
102 VNAME(dentry), VNAMELEN(dentry), inum);
103 if (!error) {
104 /*
105 * Unlock the directory. We do this because we can't
106 * hold the directory lock while doing the vn_get()
107 * in xfs_iget(). Doing so could cause us to hold
108 * a lock while waiting for the inode to finish
109 * being inactive while it's waiting for a log
110 * reservation in the inactive routine.
111 */
112 xfs_iunlock(dp, lock_mode);
113 error = xfs_iget(dp->i_mount, NULL, *inum, 0, 0, ipp, 0);
114 xfs_ilock(dp, lock_mode);
115
116 if (error) {
117 *ipp = NULL;
118 } else if ((*ipp)->i_d.di_mode == 0) {
119 /*
120 * The inode has been freed. Something is
121 * wrong so just get out of here.
122 */
123 xfs_iunlock(dp, lock_mode);
124 xfs_iput_new(*ipp, 0);
125 *ipp = NULL;
126 xfs_ilock(dp, lock_mode);
127 error = XFS_ERROR(ENOENT);
128 }
129 }
130 return error;
131}
132
133/*
	 134 * Allocates a new inode from disk and returns a pointer to the
135 * incore copy. This routine will internally commit the current
136 * transaction and allocate a new one if the Space Manager needed
137 * to do an allocation to replenish the inode free-list.
138 *
139 * This routine is designed to be called from xfs_create and
140 * xfs_create_dir.
141 *
142 */
143int
144xfs_dir_ialloc(
145 xfs_trans_t **tpp, /* input: current transaction;
146 output: may be a new transaction. */
	 147	xfs_inode_t	*dp,		/* directory within which to
	 148					   allocate the inode. */
149 mode_t mode,
150 nlink_t nlink,
151 xfs_dev_t rdev,
152 cred_t *credp,
153 prid_t prid, /* project id */
154 int okalloc, /* ok to allocate new space */
155 xfs_inode_t **ipp, /* pointer to inode; it will be
156 locked. */
157 int *committed)
158
159{
160 xfs_trans_t *tp;
161 xfs_trans_t *ntp;
162 xfs_inode_t *ip;
163 xfs_buf_t *ialloc_context = NULL;
164 boolean_t call_again = B_FALSE;
165 int code;
166 uint log_res;
167 uint log_count;
168 void *dqinfo;
169 uint tflags;
170
171 tp = *tpp;
172 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
173
174 /*
175 * xfs_ialloc will return a pointer to an incore inode if
176 * the Space Manager has an available inode on the free
177 * list. Otherwise, it will do an allocation and replenish
178 * the freelist. Since we can only do one allocation per
179 * transaction without deadlocks, we will need to commit the
180 * current transaction and start a new one. We will then
181 * need to call xfs_ialloc again to get the inode.
182 *
183 * If xfs_ialloc did an allocation to replenish the freelist,
184 * it returns the bp containing the head of the freelist as
185 * ialloc_context. We will hold a lock on it across the
186 * transaction commit so that no other process can steal
187 * the inode(s) that we've just allocated.
188 */
189 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
190 &ialloc_context, &call_again, &ip);
191
192 /*
193 * Return an error if we were unable to allocate a new inode.
194 * This should only happen if we run out of space on disk or
195 * encounter a disk error.
196 */
197 if (code) {
198 *ipp = NULL;
199 return code;
200 }
201 if (!call_again && (ip == NULL)) {
202 *ipp = NULL;
203 return XFS_ERROR(ENOSPC);
204 }
205
206 /*
207 * If call_again is set, then we were unable to get an
208 * inode in one operation. We need to commit the current
209 * transaction and call xfs_ialloc() again. It is guaranteed
210 * to succeed the second time.
211 */
212 if (call_again) {
213
214 /*
215 * Normally, xfs_trans_commit releases all the locks.
216 * We call bhold to hang on to the ialloc_context across
217 * the commit. Holding this buffer prevents any other
218 * processes from doing any allocations in this
219 * allocation group.
220 */
221 xfs_trans_bhold(tp, ialloc_context);
222 /*
	 223		 * Save the log reservation and the log count so we
	 224		 * can use them in the next transaction.
225 */
226 log_res = xfs_trans_get_log_res(tp);
227 log_count = xfs_trans_get_log_count(tp);
228
229 /*
230 * We want the quota changes to be associated with the next
231 * transaction, NOT this one. So, detach the dqinfo from this
232 * and attach it to the next transaction.
233 */
234 dqinfo = NULL;
235 tflags = 0;
236 if (tp->t_dqinfo) {
237 dqinfo = (void *)tp->t_dqinfo;
238 tp->t_dqinfo = NULL;
239 tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
240 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
241 }
242
243 ntp = xfs_trans_dup(tp);
244 code = xfs_trans_commit(tp, 0, NULL);
245 tp = ntp;
246 if (committed != NULL) {
247 *committed = 1;
248 }
249 /*
250 * If we get an error during the commit processing,
251 * release the buffer that is still held and return
252 * to the caller.
253 */
254 if (code) {
255 xfs_buf_relse(ialloc_context);
256 if (dqinfo) {
257 tp->t_dqinfo = dqinfo;
258 XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
259 }
260 *tpp = ntp;
261 *ipp = NULL;
262 return code;
263 }
264 code = xfs_trans_reserve(tp, 0, log_res, 0,
265 XFS_TRANS_PERM_LOG_RES, log_count);
266 /*
267 * Re-attach the quota info that we detached from prev trx.
268 */
269 if (dqinfo) {
270 tp->t_dqinfo = dqinfo;
271 tp->t_flags |= tflags;
272 }
273
274 if (code) {
275 xfs_buf_relse(ialloc_context);
276 *tpp = ntp;
277 *ipp = NULL;
278 return code;
279 }
280 xfs_trans_bjoin(tp, ialloc_context);
281
282 /*
283 * Call ialloc again. Since we've locked out all
284 * other allocations in this allocation group,
285 * this call should always succeed.
286 */
287 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
288 okalloc, &ialloc_context, &call_again, &ip);
289
290 /*
291 * If we get an error at this point, return to the caller
292 * so that the current transaction can be aborted.
293 */
294 if (code) {
295 *tpp = tp;
296 *ipp = NULL;
297 return code;
298 }
299 ASSERT ((!call_again) && (ip != NULL));
300
301 } else {
302 if (committed != NULL) {
303 *committed = 0;
304 }
305 }
306
307 *ipp = ip;
308 *tpp = tp;
309
310 return 0;
311}
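
/*
 * Illustrative sketch, not part of the original file, of the
 * commit-and-continue pattern used by xfs_dir_ialloc() above: hold a
 * buffer across the commit, duplicate the transaction, commit the
 * old one, re-reserve in the new one, and rejoin the buffer.  The
 * zero reservation values are hypothetical placeholders.
 */
static int
example_roll_trans(
	xfs_trans_t	**tpp,
	xfs_buf_t	*bp)
{
	xfs_trans_t	*ntp;
	int		code;

	xfs_trans_bhold(*tpp, bp);	/* keep bp locked past commit */
	ntp = xfs_trans_dup(*tpp);
	code = xfs_trans_commit(*tpp, 0, NULL);
	*tpp = ntp;
	if (code) {
		xfs_buf_relse(bp);	/* nobody will unlock it for us */
		return code;
	}
	code = xfs_trans_reserve(ntp, 0, 0, 0, XFS_TRANS_PERM_LOG_RES, 0);
	if (code) {
		xfs_buf_relse(bp);
		return code;
	}
	xfs_trans_bjoin(ntp, bp);	/* re-associate bp with ntp */
	return 0;
}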
312
313/*
314 * Decrement the link count on an inode & log the change.
315 * If this causes the link count to go to zero, initiate the
316 * logging activity required to truncate a file.
317 */
318int /* error */
319xfs_droplink(
320 xfs_trans_t *tp,
321 xfs_inode_t *ip)
322{
323 int error;
324
325 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
326
327 ASSERT (ip->i_d.di_nlink > 0);
328 ip->i_d.di_nlink--;
329 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
330
331 error = 0;
332 if (ip->i_d.di_nlink == 0) {
333 /*
334 * We're dropping the last link to this file.
335 * Move the on-disk inode to the AGI unlinked list.
336 * From xfs_inactive() we will pull the inode from
337 * the list and free it.
338 */
339 error = xfs_iunlink(tp, ip);
340 }
341 return error;
342}
343
344/*
345 * This gets called when the inode's version needs to be changed from 1 to 2.
346 * Currently this happens when the nlink field overflows the old 16-bit value
347 * or when chproj is called to change the project for the first time.
348 * As a side effect the superblock version will also get rev'd
349 * to contain the NLINK bit.
350 */
351void
352xfs_bump_ino_vers2(
353 xfs_trans_t *tp,
354 xfs_inode_t *ip)
355{
356 xfs_mount_t *mp;
357 unsigned long s;
358
359 ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
360 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1);
361
362 ip->i_d.di_version = XFS_DINODE_VERSION_2;
363 ip->i_d.di_onlink = 0;
364 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
365 mp = tp->t_mountp;
366 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
367 s = XFS_SB_LOCK(mp);
368 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
369 XFS_SB_VERSION_ADDNLINK(&mp->m_sb);
370 XFS_SB_UNLOCK(mp, s);
371 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
372 } else {
373 XFS_SB_UNLOCK(mp, s);
374 }
375 }
376 /* Caller must log the inode */
377}
378
379/*
380 * Increment the link count on an inode & log the change.
381 */
382int
383xfs_bumplink(
384 xfs_trans_t *tp,
385 xfs_inode_t *ip)
386{
387 if (ip->i_d.di_nlink >= XFS_MAXLINK)
388 return XFS_ERROR(EMLINK);
389 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
390
391 ASSERT(ip->i_d.di_nlink > 0);
392 ip->i_d.di_nlink++;
393 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
394 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
395 /*
396 * The inode has increased its number of links beyond
397 * what can fit in an old format inode. It now needs
398 * to be converted to a version 2 inode with a 32 bit
399 * link count. If this is the first inode in the file
400 * system to do this, then we need to bump the superblock
401 * version number as well.
402 */
403 xfs_bump_ino_vers2(tp, ip);
404 }
405
406 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
407 return 0;
408}
409
410/*
411 * Try to truncate the given file to 0 length. Currently called
412 * only out of xfs_remove when it has to truncate a file to free
413 * up space for the remove to proceed.
414 */
415int
416xfs_truncate_file(
417 xfs_mount_t *mp,
418 xfs_inode_t *ip)
419{
420 xfs_trans_t *tp;
421 int error;
422
423#ifdef QUOTADEBUG
424 /*
425 * This is called to truncate the quotainodes too.
426 */
427 if (XFS_IS_UQUOTA_ON(mp)) {
428 if (ip->i_ino != mp->m_sb.sb_uquotino)
429 ASSERT(ip->i_udquot);
430 }
431 if (XFS_IS_GQUOTA_ON(mp)) {
432 if (ip->i_ino != mp->m_sb.sb_gquotino)
433 ASSERT(ip->i_gdquot);
434 }
435#endif
436 /*
437 * Make the call to xfs_itruncate_start before starting the
438 * transaction, because we cannot make the call while we're
439 * in a transaction.
440 */
441 xfs_ilock(ip, XFS_IOLOCK_EXCL);
442 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
443
444 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
445 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
446 XFS_TRANS_PERM_LOG_RES,
447 XFS_ITRUNCATE_LOG_COUNT))) {
448 xfs_trans_cancel(tp, 0);
449 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
450 return error;
451 }
452
453 /*
454 * Follow the normal truncate locking protocol. Since we
	 455	 * hold the inode in the transaction, we know that its number
456 * of references will stay constant.
457 */
458 xfs_ilock(ip, XFS_ILOCK_EXCL);
459 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
460 xfs_trans_ihold(tp, ip);
461 /*
	 462	 * Signal a sync transaction.  The only case where sync isn't
	 463	 * needed is if we're truncating an already unlinked file
	 464	 * on a wsync fs.  In that case, we know the blocks can't
	 465	 * reappear in the file because the links to the file are
466 * permanently toast. Currently, we're always going to
467 * want a sync transaction because this code is being
468 * called from places where nlink is guaranteed to be 1
469 * but I'm leaving the tests in to protect against future
470 * changes -- rcc.
471 */
472 error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
473 XFS_DATA_FORK,
474 ((ip->i_d.di_nlink != 0 ||
475 !(mp->m_flags & XFS_MOUNT_WSYNC))
476 ? 1 : 0));
477 if (error) {
478 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
479 XFS_TRANS_ABORT);
480 } else {
481 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
482 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
483 NULL);
484 }
485 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
486
487 return error;
488}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
new file mode 100644
index 000000000000..e1ed6a588000
--- /dev/null
+++ b/fs/xfs/xfs_utils.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_UTILS_H__
33#define __XFS_UTILS_H__
34
35#define IRELE(ip) VN_RELE(XFS_ITOV(ip))
36#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip))
37#define ITRACE(ip) vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
38 (inst_t *)__return_address)
39
40extern int xfs_rename (bhv_desc_t *, vname_t *, vnode_t *, vname_t *, cred_t *);
41extern int xfs_get_dir_entry (vname_t *, xfs_inode_t **);
42extern int xfs_dir_lookup_int (bhv_desc_t *, uint, vname_t *, xfs_ino_t *,
43 xfs_inode_t **);
44extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
45extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, nlink_t,
46 xfs_dev_t, cred_t *, prid_t, int,
47 xfs_inode_t **, int *);
48extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *);
49extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *);
50extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *);
51
52#endif /* __XFS_UTILS_H__ */
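/*
 * Note on the helpers above (a hedged sketch, not part of the original
 * header): IHOLD()/IRELE() map inode reference counting onto the vnode
 * layer via XFS_ITOV(), so every IHOLD() must be balanced by an IRELE():
 *
 *	IHOLD(ip);
 *	(use ip across an operation that may block)
 *	IRELE(ip);
 */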
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
new file mode 100644
index 000000000000..00aae9c6a904
--- /dev/null
+++ b/fs/xfs/xfs_vfsops.c
@@ -0,0 +1,1941 @@
1/*
2 * XFS filesystem operations.
3 *
4 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2 of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 * Further, this software is distributed without any warranty that it is
15 * free of the rightful claim of any third person regarding infringement
16 * or the like. Any license provided herein, whether implied or
17 * otherwise, applies only to this software file. Patent licenses, if
18 * any, provided herein do not apply to combinations of this program with
19 * other software, or any other product whatsoever.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write the Free Software Foundation, Inc., 59
23 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
24 *
25 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
26 * Mountain View, CA 94043, or:
27 *
28 * http://www.sgi.com
29 *
30 * For further information regarding this notice, see:
31 *
32 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
33 */
34
35#include "xfs.h"
36#include "xfs_macros.h"
37#include "xfs_types.h"
38#include "xfs_inum.h"
39#include "xfs_log.h"
40#include "xfs_trans.h"
41#include "xfs_sb.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_alloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_alloc.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_ag.h"
59#include "xfs_error.h"
60#include "xfs_bmap.h"
61#include "xfs_da_btree.h"
62#include "xfs_rw.h"
63#include "xfs_refcache.h"
64#include "xfs_buf_item.h"
65#include "xfs_extfree_item.h"
66#include "xfs_quota.h"
67#include "xfs_dir2_trace.h"
68#include "xfs_acl.h"
69#include "xfs_attr.h"
70#include "xfs_clnt.h"
71#include "xfs_log_priv.h"
72
73STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
74
75int
76xfs_init(void)
77{
78 extern kmem_zone_t *xfs_bmap_free_item_zone;
79 extern kmem_zone_t *xfs_btree_cur_zone;
80 extern kmem_zone_t *xfs_trans_zone;
81 extern kmem_zone_t *xfs_buf_item_zone;
82 extern kmem_zone_t *xfs_dabuf_zone;
83#ifdef XFS_DABUF_DEBUG
84 extern lock_t xfs_dabuf_global_lock;
85 spinlock_init(&xfs_dabuf_global_lock, "xfsda");
86#endif
87
88 /*
89 * Initialize all of the zone allocators we use.
90 */
91 xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
92 "xfs_bmap_free_item");
93 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
94 "xfs_btree_cur");
95 xfs_inode_zone = kmem_zone_init(sizeof(xfs_inode_t), "xfs_inode");
96 xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
97 xfs_da_state_zone =
98 kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
99 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
100
101 /*
102	 * The size of the zone-allocated buf log item is the maximum
103 * size possible under XFS. This wastes a little bit of memory,
104 * but it is much faster.
105 */
106 xfs_buf_item_zone =
107 kmem_zone_init((sizeof(xfs_buf_log_item_t) +
108 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
109 NBWORD) * sizeof(int))),
110 "xfs_buf_item");
111 xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
112 ((XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
113 "xfs_efd_item");
114 xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
115 ((XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
116 "xfs_efi_item");
117 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
118 xfs_ili_zone = kmem_zone_init(sizeof(xfs_inode_log_item_t), "xfs_ili");
119 xfs_chashlist_zone = kmem_zone_init(sizeof(xfs_chashlist_t),
120 "xfs_chashlist");
121 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
122
123 /*
124 * Allocate global trace buffers.
125 */
126#ifdef XFS_ALLOC_TRACE
127 xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP);
128#endif
129#ifdef XFS_BMAP_TRACE
130 xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP);
131#endif
132#ifdef XFS_BMBT_TRACE
133 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
134#endif
135#ifdef XFS_DIR_TRACE
136 xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP);
137#endif
138#ifdef XFS_ATTR_TRACE
139 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
140#endif
141#ifdef XFS_DIR2_TRACE
142 xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP);
143#endif
144
145 xfs_dir_startup();
146
147#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
148 xfs_error_test_init();
149#endif /* DEBUG || INDUCE_IO_ERROR */
150
151 xfs_init_procfs();
152 xfs_sysctl_register();
153 return 0;
154}
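/*
 * Sizing note for xfs_buf_item_zone above (illustrative arithmetic; the
 * concrete constants here are assumptions, not taken from this file):
 * with XFS_MAX_BLOCKSIZE = 65536, XFS_BLI_CHUNK = 128 and NBWORD = 32,
 * the trailing dirty bitmap adds ((65536 / 128) / 32) * sizeof(int) =
 * 16 * 4 = 64 bytes to sizeof(xfs_buf_log_item_t), so one zone object
 * can track the largest possible buffer without reallocation.
 */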
155
156void
157xfs_cleanup(void)
158{
159 extern kmem_zone_t *xfs_bmap_free_item_zone;
160 extern kmem_zone_t *xfs_btree_cur_zone;
161 extern kmem_zone_t *xfs_inode_zone;
162 extern kmem_zone_t *xfs_trans_zone;
163 extern kmem_zone_t *xfs_da_state_zone;
164 extern kmem_zone_t *xfs_dabuf_zone;
165 extern kmem_zone_t *xfs_efd_zone;
166 extern kmem_zone_t *xfs_efi_zone;
167 extern kmem_zone_t *xfs_buf_item_zone;
168 extern kmem_zone_t *xfs_chashlist_zone;
169
170 xfs_cleanup_procfs();
171 xfs_sysctl_unregister();
172 xfs_refcache_destroy();
173 xfs_acl_zone_destroy(xfs_acl_zone);
174
175#ifdef XFS_DIR2_TRACE
176 ktrace_free(xfs_dir2_trace_buf);
177#endif
178#ifdef XFS_ATTR_TRACE
179 ktrace_free(xfs_attr_trace_buf);
180#endif
181#ifdef XFS_DIR_TRACE
182 ktrace_free(xfs_dir_trace_buf);
183#endif
184#ifdef XFS_BMBT_TRACE
185 ktrace_free(xfs_bmbt_trace_buf);
186#endif
187#ifdef XFS_BMAP_TRACE
188 ktrace_free(xfs_bmap_trace_buf);
189#endif
190#ifdef XFS_ALLOC_TRACE
191 ktrace_free(xfs_alloc_trace_buf);
192#endif
193
194 kmem_cache_destroy(xfs_bmap_free_item_zone);
195 kmem_cache_destroy(xfs_btree_cur_zone);
196 kmem_cache_destroy(xfs_inode_zone);
197 kmem_cache_destroy(xfs_trans_zone);
198 kmem_cache_destroy(xfs_da_state_zone);
199 kmem_cache_destroy(xfs_dabuf_zone);
200 kmem_cache_destroy(xfs_buf_item_zone);
201 kmem_cache_destroy(xfs_efd_zone);
202 kmem_cache_destroy(xfs_efi_zone);
203 kmem_cache_destroy(xfs_ifork_zone);
204 kmem_cache_destroy(xfs_ili_zone);
205 kmem_cache_destroy(xfs_chashlist_zone);
206}
207
208/*
209 * xfs_start_flags
210 *
211 * This function fills in xfs_mount_t fields based on mount args.
212 * Note: the superblock has _not_ yet been read in.
213 */
214STATIC int
215xfs_start_flags(
216 struct vfs *vfs,
217 struct xfs_mount_args *ap,
218 struct xfs_mount *mp)
219{
220 /* Values are in BBs */
221 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
222 /*
223 * At this point the superblock has not been read
224 * in, therefore we do not know the block size.
225 * Before the mount call ends we will convert
226 * these to FSBs.
227 */
228 mp->m_dalign = ap->sunit;
229 mp->m_swidth = ap->swidth;
230 }
231
232 if (ap->logbufs != -1 &&
233#if defined(DEBUG) || defined(XLOG_NOLOG)
234 ap->logbufs != 0 &&
235#endif
236 (ap->logbufs < XLOG_MIN_ICLOGS ||
237 ap->logbufs > XLOG_MAX_ICLOGS)) {
238 cmn_err(CE_WARN,
239 "XFS: invalid logbufs value: %d [not %d-%d]",
240 ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
241 return XFS_ERROR(EINVAL);
242 }
243 mp->m_logbufs = ap->logbufs;
244 if (ap->logbufsize != -1 &&
245 ap->logbufsize != 16 * 1024 &&
246 ap->logbufsize != 32 * 1024 &&
247 ap->logbufsize != 64 * 1024 &&
248 ap->logbufsize != 128 * 1024 &&
249 ap->logbufsize != 256 * 1024) {
250 cmn_err(CE_WARN,
251 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
252 ap->logbufsize);
253 return XFS_ERROR(EINVAL);
254 }
255 mp->m_ihsize = ap->ihashsize;
256 mp->m_logbsize = ap->logbufsize;
257 mp->m_fsname_len = strlen(ap->fsname) + 1;
258 mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
259 strcpy(mp->m_fsname, ap->fsname);
260
261 if (ap->flags & XFSMNT_WSYNC)
262 mp->m_flags |= XFS_MOUNT_WSYNC;
263#if XFS_BIG_INUMS
264 if (ap->flags & XFSMNT_INO64) {
265 mp->m_flags |= XFS_MOUNT_INO64;
266 mp->m_inoadd = XFS_INO64_OFFSET;
267 }
268#endif
269 if (ap->flags & XFSMNT_NOATIME)
270 mp->m_flags |= XFS_MOUNT_NOATIME;
271
272 if (ap->flags & XFSMNT_RETERR)
273 mp->m_flags |= XFS_MOUNT_RETERR;
274
275 if (ap->flags & XFSMNT_NOALIGN)
276 mp->m_flags |= XFS_MOUNT_NOALIGN;
277
278 if (ap->flags & XFSMNT_SWALLOC)
279 mp->m_flags |= XFS_MOUNT_SWALLOC;
280
281 if (ap->flags & XFSMNT_OSYNCISOSYNC)
282 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
283
284 if (ap->flags & XFSMNT_32BITINODES)
285 mp->m_flags |= (XFS_MOUNT_32BITINODES | XFS_MOUNT_32BITINOOPT);
286
287 if (ap->flags & XFSMNT_IOSIZE) {
288 if (ap->iosizelog > XFS_MAX_IO_LOG ||
289 ap->iosizelog < XFS_MIN_IO_LOG) {
290 cmn_err(CE_WARN,
291 "XFS: invalid log iosize: %d [not %d-%d]",
292 ap->iosizelog, XFS_MIN_IO_LOG,
293 XFS_MAX_IO_LOG);
294 return XFS_ERROR(EINVAL);
295 }
296
297 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
298 mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
299 }
300
301 if (ap->flags & XFSMNT_IHASHSIZE)
302 mp->m_flags |= XFS_MOUNT_IHASHSIZE;
303
304 if (ap->flags & XFSMNT_IDELETE)
305 mp->m_flags |= XFS_MOUNT_IDELETE;
306
307 if (ap->flags & XFSMNT_DIRSYNC)
308 mp->m_flags |= XFS_MOUNT_DIRSYNC;
309
310 /*
311 * no recovery flag requires a read-only mount
312 */
313 if (ap->flags & XFSMNT_NORECOVERY) {
314 if (!(vfs->vfs_flag & VFS_RDONLY)) {
315 cmn_err(CE_WARN,
316 "XFS: tried to mount a FS read-write without recovery!");
317 return XFS_ERROR(EINVAL);
318 }
319 mp->m_flags |= XFS_MOUNT_NORECOVERY;
320 }
321
322 if (ap->flags & XFSMNT_NOUUID)
323 mp->m_flags |= XFS_MOUNT_NOUUID;
324 if (ap->flags & XFSMNT_NOLOGFLUSH)
325 mp->m_flags |= XFS_MOUNT_NOLOGFLUSH;
326
327 return 0;
328}
329
330/*
331 * This function fills in xfs_mount_t fields based on mount args.
332 * Note: the superblock _has_ now been read in.
333 */
334STATIC int
335xfs_finish_flags(
336 struct vfs *vfs,
337 struct xfs_mount_args *ap,
338 struct xfs_mount *mp)
339{
340 int ronly = (vfs->vfs_flag & VFS_RDONLY);
341
342	/* Fail a mount where the logbuf is smaller than the log stripe */
343 if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
344 if ((ap->logbufsize == -1) &&
345 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
346 mp->m_logbsize = mp->m_sb.sb_logsunit;
347 } else if (ap->logbufsize < mp->m_sb.sb_logsunit) {
348 cmn_err(CE_WARN,
349 "XFS: logbuf size must be greater than or equal to log stripe size");
350 return XFS_ERROR(EINVAL);
351 }
352 } else {
353 /* Fail a mount if the logbuf is larger than 32K */
354 if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
355 cmn_err(CE_WARN,
356 "XFS: logbuf size for version 1 logs must be 16K or 32K");
357 return XFS_ERROR(EINVAL);
358 }
359 }
360
361 /*
362 * prohibit r/w mounts of read-only filesystems
363 */
364 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
365 cmn_err(CE_WARN,
366 "XFS: cannot mount a read-only filesystem as read-write");
367 return XFS_ERROR(EROFS);
368 }
369
370 /*
371 * disallow mount attempts with (IRIX) project quota enabled
372 */
373 if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
374 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT)) {
375 cmn_err(CE_WARN,
376 "XFS: cannot mount a filesystem with IRIX project quota enabled");
377 return XFS_ERROR(ENOSYS);
378 }
379
380 /*
381 * check for shared mount.
382 */
383 if (ap->flags & XFSMNT_SHARED) {
384 if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb))
385 return XFS_ERROR(EINVAL);
386
387 /*
388 * For IRIX 6.5, shared mounts must have the shared
389 * version bit set, have the persistent readonly
390 * field set, must be version 0 and can only be mounted
391 * read-only.
392 */
393 if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
394 (mp->m_sb.sb_shared_vn != 0))
395 return XFS_ERROR(EINVAL);
396
397 mp->m_flags |= XFS_MOUNT_SHARED;
398
399 /*
400 * Shared XFS V0 can't deal with DMI. Return EINVAL.
401 */
402 if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
403 return XFS_ERROR(EINVAL);
404 }
405
406 return 0;
407}
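/*
 * Worked example for the v2-log check above (numbers are hypothetical):
 * on a filesystem with sb_logsunit = 64k, a mount passing logbufsize=32k
 * is rejected, while a mount that left logbufsize at the -1 default gets
 * m_logbsize silently bumped to the 64k log stripe unit.
 */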
408
409/*
410 * xfs_mount
411 *
412 * The file system configurations are:
413 * (1) device (partition) with data and internal log
414 * (2) logical volume with data and log subvolumes.
415 * (3) logical volume with data, log, and realtime subvolumes.
416 *
417 * We only have to handle opening the log and realtime volumes here if
418 * they are present. The data subvolume has already been opened by
419 * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev.
420 */
421STATIC int
422xfs_mount(
423 struct bhv_desc *bhvp,
424 struct xfs_mount_args *args,
425 cred_t *credp)
426{
427 struct vfs *vfsp = bhvtovfs(bhvp);
428 struct bhv_desc *p;
429 struct xfs_mount *mp = XFS_BHVTOM(bhvp);
430 struct block_device *ddev, *logdev, *rtdev;
431 int flags = 0, error;
432
433 ddev = vfsp->vfs_super->s_bdev;
434 logdev = rtdev = NULL;
435
436 /*
437 * Setup xfs_mount function vectors from available behaviors
438 */
439 p = vfs_bhv_lookup(vfsp, VFS_POSITION_DM);
440 mp->m_dm_ops = p ? *(xfs_dmops_t *) vfs_bhv_custom(p) : xfs_dmcore_stub;
441 p = vfs_bhv_lookup(vfsp, VFS_POSITION_QM);
442 mp->m_qm_ops = p ? *(xfs_qmops_t *) vfs_bhv_custom(p) : xfs_qmcore_stub;
443 p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO);
444 mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs;
445
446 /*
447 * Open real time and log devices - order is important.
448 */
449 if (args->logname[0]) {
450 error = xfs_blkdev_get(mp, args->logname, &logdev);
451 if (error)
452 return error;
453 }
454 if (args->rtname[0]) {
455 error = xfs_blkdev_get(mp, args->rtname, &rtdev);
456 if (error) {
457 xfs_blkdev_put(logdev);
458 return error;
459 }
460
461 if (rtdev == ddev || rtdev == logdev) {
462 cmn_err(CE_WARN,
463 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev.");
464 xfs_blkdev_put(logdev);
465 xfs_blkdev_put(rtdev);
466 return EINVAL;
467 }
468 }
469
470 /*
471 * Setup xfs_mount buffer target pointers
472 */
473 error = ENOMEM;
474 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
475 if (!mp->m_ddev_targp) {
476 xfs_blkdev_put(logdev);
477 xfs_blkdev_put(rtdev);
478 return error;
479 }
480 if (rtdev) {
481 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
482 if (!mp->m_rtdev_targp)
483 goto error0;
484 }
485 mp->m_logdev_targp = (logdev && logdev != ddev) ?
486 xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp;
487 if (!mp->m_logdev_targp)
488 goto error0;
489
490 /*
491 * Setup flags based on mount(2) options and then the superblock
492 */
493 error = xfs_start_flags(vfsp, args, mp);
494 if (error)
495 goto error1;
496 error = xfs_readsb(mp);
497 if (error)
498 goto error1;
499 error = xfs_finish_flags(vfsp, args, mp);
500 if (error)
501 goto error2;
502
503 /*
504 * Setup xfs_mount buffer target pointers based on superblock
505 */
506 error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
507 mp->m_sb.sb_sectsize);
508 if (!error && logdev && logdev != ddev) {
509 unsigned int log_sector_size = BBSIZE;
510
511 if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb))
512 log_sector_size = mp->m_sb.sb_logsectsize;
513 error = xfs_setsize_buftarg(mp->m_logdev_targp,
514 mp->m_sb.sb_blocksize,
515 log_sector_size);
516 }
517 if (!error && rtdev)
518 error = xfs_setsize_buftarg(mp->m_rtdev_targp,
519 mp->m_sb.sb_blocksize,
520 mp->m_sb.sb_sectsize);
521 if (error)
522 goto error2;
523
524 error = XFS_IOINIT(vfsp, args, flags);
525 if (!error)
526 return 0;
527error2:
528 if (mp->m_sb_bp)
529 xfs_freesb(mp);
530error1:
531 xfs_binval(mp->m_ddev_targp);
532 if (logdev && logdev != ddev)
533 xfs_binval(mp->m_logdev_targp);
534 if (rtdev)
535 xfs_binval(mp->m_rtdev_targp);
536error0:
537 xfs_unmountfs_close(mp, credp);
538 return error;
539}
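/*
 * Note on the unwind order above (a sketch of the intent, not a spec):
 * the error labels are arranged so later failures fall through all of
 * the earlier cleanup -- error2 frees the in-core superblock, error1
 * invalidates buffers cached against the data, log and realtime
 * targets, and error0 closes the underlying devices via
 * xfs_unmountfs_close().
 */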
540
541STATIC int
542xfs_unmount(
543 bhv_desc_t *bdp,
544 int flags,
545 cred_t *credp)
546{
547 struct vfs *vfsp = bhvtovfs(bdp);
548 xfs_mount_t *mp = XFS_BHVTOM(bdp);
549 xfs_inode_t *rip;
550 vnode_t *rvp;
551 int unmount_event_wanted = 0;
552 int unmount_event_flags = 0;
553 int xfs_unmountfs_needed = 0;
554 int error;
555
556 rip = mp->m_rootip;
557 rvp = XFS_ITOV(rip);
558
559 if (vfsp->vfs_flag & VFS_DMI) {
560 error = XFS_SEND_PREUNMOUNT(mp, vfsp,
561 rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL,
562 NULL, NULL, 0, 0,
563 (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
564 0:DM_FLAGS_UNWANTED);
565 if (error)
566 return XFS_ERROR(error);
567 unmount_event_wanted = 1;
568 unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))?
569 0 : DM_FLAGS_UNWANTED;
570 }
571
572 /*
573 * First blow any referenced inode from this file system
574 * out of the reference cache, and delete the timer.
575 */
576 xfs_refcache_purge_mp(mp);
577
578 XFS_bflush(mp->m_ddev_targp);
579 error = xfs_unmount_flush(mp, 0);
580 if (error)
581 goto out;
582
583 ASSERT(vn_count(rvp) == 1);
584
585 /*
586 * Drop the reference count
587 */
588 VN_RELE(rvp);
589
590 /*
591 * If we're forcing a shutdown, typically because of a media error,
592 * we want to make sure we invalidate dirty pages that belong to
593 * referenced vnodes as well.
594 */
595 if (XFS_FORCED_SHUTDOWN(mp)) {
596 error = xfs_sync(&mp->m_bhv,
597 (SYNC_WAIT | SYNC_CLOSE), credp);
598 ASSERT(error != EFSCORRUPTED);
599 }
600 xfs_unmountfs_needed = 1;
601
602out:
603 /* Send DMAPI event, if required.
604 * Then do xfs_unmountfs() if needed.
605 * Then return error (or zero).
606 */
607 if (unmount_event_wanted) {
608 /* Note: mp structure must still exist for
609 * XFS_SEND_UNMOUNT() call.
610 */
611 XFS_SEND_UNMOUNT(mp, vfsp, error == 0 ? rvp : NULL,
612 DM_RIGHT_NULL, 0, error, unmount_event_flags);
613 }
614 if (xfs_unmountfs_needed) {
615 /*
616 * Call common unmount function to flush to disk
617 * and free the super block buffer & mount structures.
618 */
619 xfs_unmountfs(mp, credp);
620 }
621
622 return XFS_ERROR(error);
623}
624
625#define REMOUNT_READONLY_FLAGS (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
626
627STATIC int
628xfs_mntupdate(
629 bhv_desc_t *bdp,
630 int *flags,
631 struct xfs_mount_args *args)
632{
633 struct vfs *vfsp = bhvtovfs(bdp);
634 xfs_mount_t *mp = XFS_BHVTOM(bdp);
635 int pincount, error;
636 int count = 0;
637
638 if (args->flags & XFSMNT_NOATIME)
639 mp->m_flags |= XFS_MOUNT_NOATIME;
640 else
641 mp->m_flags &= ~XFS_MOUNT_NOATIME;
642
643 if (!(vfsp->vfs_flag & VFS_RDONLY)) {
644 VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
645 }
646
647 if (*flags & MS_RDONLY) {
648 xfs_refcache_purge_mp(mp);
649 xfs_flush_buftarg(mp->m_ddev_targp, 0);
650 xfs_finish_reclaim_all(mp, 0);
651
652		/* This loop must run at least twice.
653		 * The first pass flushes most metadata,
654		 * but that in turn generates more metadata
655		 * (typically directory updates), which must
656		 * then be flushed and logged before we can
657		 * write the unmount record.
658		 */
659 do {
660 VFS_SYNC(vfsp, REMOUNT_READONLY_FLAGS, NULL, error);
661 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
662 if (!pincount) {
663 delay(50);
664 count++;
665 }
666 } while (count < 2);
667
668 /* Ok now write out an unmount record */
669 xfs_log_unmount_write(mp);
670 xfs_unmountfs_writesb(mp);
671 vfsp->vfs_flag |= VFS_RDONLY;
672 } else {
673 vfsp->vfs_flag &= ~VFS_RDONLY;
674 }
675
676 return 0;
677}
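/*
 * Usage sketch (assumed admin command and device names, not from this
 * file): a request such as
 *
 *	mount -o remount,ro /dev/sdb1 /mnt
 *
 * reaches xfs_mntupdate() with MS_RDONLY set in *flags, triggering the
 * flush loop and the unmount record write above so that a later crash
 * needs no log recovery on the now read-only filesystem.
 */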
678
679/*
680 * xfs_unmount_flush implements a set of flush operations on special
681 * inodes, which are needed as a separate set of operations so that
682 * they can be called as part of the relocation process.
683 */
684int
685xfs_unmount_flush(
686 xfs_mount_t *mp, /* Mount structure we are getting
687 rid of. */
688 int relocation) /* Called from vfs relocation. */
689{
690 xfs_inode_t *rip = mp->m_rootip;
691 xfs_inode_t *rbmip;
692 xfs_inode_t *rsumip = NULL;
693 vnode_t *rvp = XFS_ITOV(rip);
694 int error;
695
696 xfs_ilock(rip, XFS_ILOCK_EXCL);
697 xfs_iflock(rip);
698
699 /*
700 * Flush out the real time inodes.
701 */
702 if ((rbmip = mp->m_rbmip) != NULL) {
703 xfs_ilock(rbmip, XFS_ILOCK_EXCL);
704 xfs_iflock(rbmip);
705 error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
706 xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
707
708 if (error == EFSCORRUPTED)
709 goto fscorrupt_out;
710
711 ASSERT(vn_count(XFS_ITOV(rbmip)) == 1);
712
713 rsumip = mp->m_rsumip;
714 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
715 xfs_iflock(rsumip);
716 error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
717 xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
718
719 if (error == EFSCORRUPTED)
720 goto fscorrupt_out;
721
722 ASSERT(vn_count(XFS_ITOV(rsumip)) == 1);
723 }
724
725 /*
726 * Synchronously flush root inode to disk
727 */
728 error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
729 if (error == EFSCORRUPTED)
730 goto fscorrupt_out2;
731
732 if (vn_count(rvp) != 1 && !relocation) {
733 xfs_iunlock(rip, XFS_ILOCK_EXCL);
734 return XFS_ERROR(EBUSY);
735 }
736
737 /*
738	 * Release the dquots that the root inode, rbmino and rsumino
739	 * might be holding, and flush and purge the quota inodes.
740 */
741 error = XFS_QM_UNMOUNT(mp);
742 if (error == EFSCORRUPTED)
743 goto fscorrupt_out2;
744
745 if (rbmip) {
746 VN_RELE(XFS_ITOV(rbmip));
747 VN_RELE(XFS_ITOV(rsumip));
748 }
749
750 xfs_iunlock(rip, XFS_ILOCK_EXCL);
751 return 0;
752
753fscorrupt_out:
754 xfs_ifunlock(rip);
755
756fscorrupt_out2:
757 xfs_iunlock(rip, XFS_ILOCK_EXCL);
758
759 return XFS_ERROR(EFSCORRUPTED);
760}
761
762/*
763 * xfs_root extracts the root vnode from a vfs.
764 *
765 * vfsp -- the vfs struct for the desired file system
766 * vpp -- address of the caller's vnode pointer which should be
767 * set to the desired fs root vnode
768 */
769STATIC int
770xfs_root(
771 bhv_desc_t *bdp,
772 vnode_t **vpp)
773{
774 vnode_t *vp;
775
776 vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip);
777 VN_HOLD(vp);
778 *vpp = vp;
779 return 0;
780}
781
782/*
783 * xfs_statvfs
784 *
785 * Fill in the statvfs structure for the given file system. We use
786 * the superblock lock in the mount structure to ensure a consistent
787 * snapshot of the counters returned.
788 */
789STATIC int
790xfs_statvfs(
791 bhv_desc_t *bdp,
792 xfs_statfs_t *statp,
793 vnode_t *vp)
794{
795 __uint64_t fakeinos;
796 xfs_extlen_t lsize;
797 xfs_mount_t *mp;
798 xfs_sb_t *sbp;
799 unsigned long s;
800 u64 id;
801
802 mp = XFS_BHVTOM(bdp);
803 sbp = &(mp->m_sb);
804
805 statp->f_type = XFS_SB_MAGIC;
806
807 s = XFS_SB_LOCK(mp);
808 statp->f_bsize = sbp->sb_blocksize;
809 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
810 statp->f_blocks = sbp->sb_dblocks - lsize;
811 statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks;
812 fakeinos = statp->f_bfree << sbp->sb_inopblog;
813#if XFS_BIG_INUMS
814 fakeinos += mp->m_inoadd;
815#endif
816 statp->f_files =
817 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
818 if (mp->m_maxicount)
819#if XFS_BIG_INUMS
820 if (!mp->m_inoadd)
821#endif
822 statp->f_files = min_t(typeof(statp->f_files),
823 statp->f_files,
824 mp->m_maxicount);
825 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
826 XFS_SB_UNLOCK(mp, s);
827
828 id = huge_encode_dev(mp->m_dev);
829 statp->f_fsid.val[0] = (u32)id;
830 statp->f_fsid.val[1] = (u32)(id >> 32);
831 statp->f_namelen = MAXNAMELEN - 1;
832
833 return 0;
834}
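/*
 * Worked example for the f_files computation above (all numbers are
 * hypothetical): with sb_inopblog = 5 (32 inodes per block) and
 * f_bfree = 1000 free blocks, fakeinos = 1000 << 5 = 32000, i.e. the
 * inodes that could still be created from free space; this is added to
 * sb_icount and then clamped by XFS_MAXINUMBER and m_maxicount.
 */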
835
836
837/*
838 * xfs_sync flushes any pending I/O to file system vfsp.
839 *
840 * This routine is called by vfs_sync() to make sure that things make it
841 * out to disk eventually, on sync() system calls to flush out everything,
842 * and when the file system is unmounted. For the vfs_sync() case, all
843 * we really need to do is sync out the log to make all of our meta-data
844 * updates permanent (except for timestamps). For calls from pflushd(),
845 * dirty pages are kept moving by calling pdflush() on the inodes
846 * containing them. We also flush the inodes that we can lock without
847 * sleeping and the superblock if we can lock it without sleeping from
848 * vfs_sync() so that items at the tail of the log are always moving out.
849 *
850 * Flags:
851 * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
852 * to sleep if we can help it. All we really need
853 * to do is ensure that the log is synced at least
854 * periodically. We also push the inodes and
855 * superblock if we can lock them without sleeping
856 * and they are not pinned.
857 * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
858 * set, then we really want to lock each inode and flush
859 * it.
860 * SYNC_WAIT - All the flushes that take place in this call should
861 * be synchronous.
862 * SYNC_DELWRI - This tells us to push dirty pages associated with
863 * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
864 * determine if they should be flushed sync, async, or
865 * delwri.
866 * SYNC_CLOSE - This flag is passed when the system is being
867 * unmounted. We should sync and invalidate everything.
868 * SYNC_FSDATA - This indicates that the caller would like to make
869 * sure the superblock is safe on disk. We can ensure
870 * this by simply making sure the log gets flushed
871 * if SYNC_BDFLUSH is set, and by actually writing it
872 * out otherwise.
873 *
874 */
875/*ARGSUSED*/
876STATIC int
877xfs_sync(
878 bhv_desc_t *bdp,
879 int flags,
880 cred_t *credp)
881{
882 xfs_mount_t *mp;
883
884 mp = XFS_BHVTOM(bdp);
885 return (xfs_syncsub(mp, flags, 0, NULL));
886}
887
888/*
889 * xfs sync routine for internal use
890 *
891 * This routine supports all of the flags defined for the generic VFS_SYNC
892 * interface as explained above under xfs_sync. In the interests of not
893 * changing interfaces within the 6.5 family, additional internally-
894 * required functions are specified within a separate xflags parameter,
895 * only available by calling this routine.
896 *
897 */
898STATIC int
899xfs_sync_inodes(
900 xfs_mount_t *mp,
901 int flags,
902 int xflags,
903 int *bypassed)
904{
905 xfs_inode_t *ip = NULL;
906 xfs_inode_t *ip_next;
907 xfs_buf_t *bp;
908 vnode_t *vp = NULL;
909 vmap_t vmap;
910 int error;
911 int last_error;
912 uint64_t fflag;
913 uint lock_flags;
914 uint base_lock_flags;
915 boolean_t mount_locked;
916 boolean_t vnode_refed;
917 int preempt;
918 xfs_dinode_t *dip;
919 xfs_iptr_t *ipointer;
920#ifdef DEBUG
921 boolean_t ipointer_in = B_FALSE;
922
923#define IPOINTER_SET ipointer_in = B_TRUE
924#define IPOINTER_CLR ipointer_in = B_FALSE
925#else
926#define IPOINTER_SET
927#define IPOINTER_CLR
928#endif
929
930
931/* Insert a marker record into the inode list after inode ip. The list
932 * must be locked when this is called. After the call the list will no
933 * longer be locked.
934 */
935#define IPOINTER_INSERT(ip, mp) { \
936 ASSERT(ipointer_in == B_FALSE); \
937 ipointer->ip_mnext = ip->i_mnext; \
938 ipointer->ip_mprev = ip; \
939 ip->i_mnext = (xfs_inode_t *)ipointer; \
940 ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
941 preempt = 0; \
942 XFS_MOUNT_IUNLOCK(mp); \
943 mount_locked = B_FALSE; \
944 IPOINTER_SET; \
945 }
946
947/* Remove the marker from the inode list. If the marker was the only item
948 * in the list then there are no remaining inodes and we should zero out
949 * the whole list. If we are the current head of the list then move the head
950 * past us.
951 */
952#define IPOINTER_REMOVE(ip, mp) { \
953 ASSERT(ipointer_in == B_TRUE); \
954 if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
955 ip = ipointer->ip_mnext; \
956 ip->i_mprev = ipointer->ip_mprev; \
957 ipointer->ip_mprev->i_mnext = ip; \
958 if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
959 mp->m_inodes = ip; \
960 } \
961 } else { \
962 ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
963 mp->m_inodes = NULL; \
964 ip = NULL; \
965 } \
966 IPOINTER_CLR; \
967 }
968
969#define XFS_PREEMPT_MASK 0x7f
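/*
 * Sketch of the marker protocol above (illustrative, simplified): with
 * inodes A -> B -> C on the circular m_inodes list, IPOINTER_INSERT(A, mp)
 * produces A -> marker -> B -> C and drops the mount lock. If B is
 * reclaimed while the lock is dropped, the marker's ip_mnext is fixed up
 * to point at C, so IPOINTER_REMOVE resumes the walk without touching
 * freed memory. Other walkers recognize the marker by its NULL i_mount,
 * and XFS_PREEMPT_MASK makes the walk yield the mount lock every 128
 * iterations.
 */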
970
971 if (bypassed)
972 *bypassed = 0;
973 if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
974 return 0;
975 error = 0;
976 last_error = 0;
977 preempt = 0;
978
979 /* Allocate a reference marker */
980 ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
981
982 fflag = XFS_B_ASYNC; /* default is don't wait */
983 if (flags & SYNC_BDFLUSH)
984 fflag = XFS_B_DELWRI;
985 if (flags & SYNC_WAIT)
986 fflag = 0; /* synchronous overrides all */
987
988 base_lock_flags = XFS_ILOCK_SHARED;
989 if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
990 /*
991 * We need the I/O lock if we're going to call any of
992 * the flush/inval routines.
993 */
994 base_lock_flags |= XFS_IOLOCK_SHARED;
995 }
996
997 XFS_MOUNT_ILOCK(mp);
998
999 ip = mp->m_inodes;
1000
1001 mount_locked = B_TRUE;
1002 vnode_refed = B_FALSE;
1003
1004 IPOINTER_CLR;
1005
1006 do {
1007 ASSERT(ipointer_in == B_FALSE);
1008 ASSERT(vnode_refed == B_FALSE);
1009
1010 lock_flags = base_lock_flags;
1011
1012 /*
1013 * There were no inodes in the list, just break out
1014 * of the loop.
1015 */
1016 if (ip == NULL) {
1017 break;
1018 }
1019
1020 /*
1021 * We found another sync thread marker - skip it
1022 */
1023 if (ip->i_mount == NULL) {
1024 ip = ip->i_mnext;
1025 continue;
1026 }
1027
1028 vp = XFS_ITOV_NULL(ip);
1029
1030 /*
1031 * If the vnode is gone then this is being torn down,
1032 * call reclaim if it is flushed, else let regular flush
1033 * code deal with it later in the loop.
1034 */
1035
1036 if (vp == NULL) {
1037 /* Skip ones already in reclaim */
1038 if (ip->i_flags & XFS_IRECLAIM) {
1039 ip = ip->i_mnext;
1040 continue;
1041 }
1042 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
1043 ip = ip->i_mnext;
1044 } else if ((xfs_ipincount(ip) == 0) &&
1045 xfs_iflock_nowait(ip)) {
1046 IPOINTER_INSERT(ip, mp);
1047
1048 xfs_finish_reclaim(ip, 1,
1049 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
1050
1051 XFS_MOUNT_ILOCK(mp);
1052 mount_locked = B_TRUE;
1053 IPOINTER_REMOVE(ip, mp);
1054 } else {
1055 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1056 ip = ip->i_mnext;
1057 }
1058 continue;
1059 }
1060
1061 if (VN_BAD(vp)) {
1062 ip = ip->i_mnext;
1063 continue;
1064 }
1065
1066 if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
1067 XFS_MOUNT_IUNLOCK(mp);
1068 kmem_free(ipointer, sizeof(xfs_iptr_t));
1069 return 0;
1070 }
1071
1072 /*
1073 * If this is just vfs_sync() or pflushd() calling
1074 * then we can skip inodes for which it looks like
1075 * there is nothing to do. Since we don't have the
1076	 * inode locked this is racy, but these are periodic
1077 * calls so it doesn't matter. For the others we want
1078 * to know for sure, so we at least try to lock them.
1079 */
1080 if (flags & SYNC_BDFLUSH) {
1081 if (((ip->i_itemp == NULL) ||
1082 !(ip->i_itemp->ili_format.ilf_fields &
1083 XFS_ILOG_ALL)) &&
1084 (ip->i_update_core == 0)) {
1085 ip = ip->i_mnext;
1086 continue;
1087 }
1088 }
1089
1090 /*
1091 * Try to lock without sleeping. We're out of order with
1092 * the inode list lock here, so if we fail we need to drop
1093 * the mount lock and try again. If we're called from
1094 * bdflush() here, then don't bother.
1095 *
1096 * The inode lock here actually coordinates with the
1097 * almost spurious inode lock in xfs_ireclaim() to prevent
1098		 * the vnode we handle here (without holding a reference)
1099		 * from being freed while we use it. If we lock the inode
1100 * while it's on the mount list here, then the spurious inode
1101 * lock in xfs_ireclaim() after the inode is pulled from
1102 * the mount list will sleep until we release it here.
1103 * This keeps the vnode from being freed while we reference
1104 * it. It is also cheaper and simpler than actually doing
1105 * a vn_get() for every inode we touch here.
1106 */
1107 if (xfs_ilock_nowait(ip, lock_flags) == 0) {
1108
1109 if ((flags & SYNC_BDFLUSH) || (vp == NULL)) {
1110 ip = ip->i_mnext;
1111 continue;
1112 }
1113
1114 /*
1115 * We need to unlock the inode list lock in order
1116 * to lock the inode. Insert a marker record into
1117 * the inode list to remember our position, dropping
1118 * the lock is now done inside the IPOINTER_INSERT
1119 * macro.
1120 *
1121 * We also use the inode list lock to protect us
1122 * in taking a snapshot of the vnode version number
1123 * for use in calling vn_get().
1124 */
1125 VMAP(vp, vmap);
1126 IPOINTER_INSERT(ip, mp);
1127
1128 vp = vn_get(vp, &vmap);
1129 if (vp == NULL) {
1130 /*
1131 * The vnode was reclaimed once we let go
1132 * of the inode list lock. Skip to the
1133 * next list entry. Remove the marker.
1134 */
1135
1136 XFS_MOUNT_ILOCK(mp);
1137
1138 mount_locked = B_TRUE;
1139 vnode_refed = B_FALSE;
1140
1141 IPOINTER_REMOVE(ip, mp);
1142
1143 continue;
1144 }
1145
1146 xfs_ilock(ip, lock_flags);
1147
1148 ASSERT(vp == XFS_ITOV(ip));
1149 ASSERT(ip->i_mount == mp);
1150
1151 vnode_refed = B_TRUE;
1152 }
1153
1154 /* From here on in the loop we may have a marker record
1155 * in the inode list.
1156 */
1157
1158 if ((flags & SYNC_CLOSE) && (vp != NULL)) {
1159 /*
1160 * This is the shutdown case. We just need to
1161 * flush and invalidate all the pages associated
1162 * with the inode. Drop the inode lock since
1163 * we can't hold it across calls to the buffer
1164 * cache.
1165 *
1166 * We don't set the VREMAPPING bit in the vnode
1167 * here, because we don't hold the vnode lock
1168 * exclusively. It doesn't really matter, though,
1169 * because we only come here when we're shutting
1170 * down anyway.
1171 */
1172 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1173
1174 if (XFS_FORCED_SHUTDOWN(mp)) {
1175 VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
1176 } else {
1177 VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
1178 }
1179
1180 xfs_ilock(ip, XFS_ILOCK_SHARED);
1181
1182 } else if ((flags & SYNC_DELWRI) && (vp != NULL)) {
1183 if (VN_DIRTY(vp)) {
1184 /* We need to have dropped the lock here,
1185 * so insert a marker if we have not already
1186 * done so.
1187 */
1188 if (mount_locked) {
1189 IPOINTER_INSERT(ip, mp);
1190 }
1191
1192 /*
1193 * Drop the inode lock since we can't hold it
1194 * across calls to the buffer cache.
1195 */
1196 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1197 VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
1198 fflag, FI_NONE, error);
1199 xfs_ilock(ip, XFS_ILOCK_SHARED);
1200 }
1201
1202 }
1203
1204 if (flags & SYNC_BDFLUSH) {
1205 if ((flags & SYNC_ATTR) &&
1206 ((ip->i_update_core) ||
1207 ((ip->i_itemp != NULL) &&
1208 (ip->i_itemp->ili_format.ilf_fields != 0)))) {
1209
1210 /* Insert marker and drop lock if not already
1211 * done.
1212 */
1213 if (mount_locked) {
1214 IPOINTER_INSERT(ip, mp);
1215 }
1216
1217 /*
1218 * We don't want the periodic flushing of the
1219 * inodes by vfs_sync() to interfere with
1220 * I/O to the file, especially read I/O
1221 * where it is only the access time stamp
1222 * that is being flushed out. To prevent
1223 * long periods where we have both inode
1224 * locks held shared here while reading the
1225 * inode's buffer in from disk, we drop the
1226 * inode lock while reading in the inode
1227 * buffer. We have to release the buffer
1228 * and reacquire the inode lock so that they
1229 * are acquired in the proper order (inode
1230 * locks first). The buffer will go at the
1231 * end of the lru chain, though, so we can
1232 * expect it to still be there when we go
1233 * for it again in xfs_iflush().
1234 */
1235 if ((xfs_ipincount(ip) == 0) &&
1236 xfs_iflock_nowait(ip)) {
1237
1238 xfs_ifunlock(ip);
1239 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1240
1241 error = xfs_itobp(mp, NULL, ip,
1242 &dip, &bp, 0);
1243 if (!error) {
1244 xfs_buf_relse(bp);
1245 } else {
1246 /* Bailing out, remove the
1247 * marker and free it.
1248 */
1249 XFS_MOUNT_ILOCK(mp);
1250
1251 IPOINTER_REMOVE(ip, mp);
1252
1253 XFS_MOUNT_IUNLOCK(mp);
1254
1255 ASSERT(!(lock_flags &
1256 XFS_IOLOCK_SHARED));
1257
1258 kmem_free(ipointer,
1259 sizeof(xfs_iptr_t));
1260 return (0);
1261 }
1262
1263 /*
1264 * Since we dropped the inode lock,
1265 * the inode may have been reclaimed.
1266 * Therefore, we reacquire the mount
1267				 * lock and check whether the inode we
1268				 * were processing was reclaimed. If it
1269				 * was, the ipointer marker will no
1270 * longer point back at us. In this
1271 * case, move ip along to the inode
1272 * after the marker, remove the marker
1273 * and continue.
1274 */
1275 XFS_MOUNT_ILOCK(mp);
1276 mount_locked = B_TRUE;
1277
1278 if (ip != ipointer->ip_mprev) {
1279 IPOINTER_REMOVE(ip, mp);
1280
1281 ASSERT(!vnode_refed);
1282 ASSERT(!(lock_flags &
1283 XFS_IOLOCK_SHARED));
1284 continue;
1285 }
1286
1287 ASSERT(ip->i_mount == mp);
1288
1289 if (xfs_ilock_nowait(ip,
1290 XFS_ILOCK_SHARED) == 0) {
1291 ASSERT(ip->i_mount == mp);
1292 /*
1293 * We failed to reacquire
1294 * the inode lock without
1295 * sleeping, so just skip
1296 * the inode for now. We
1297 * clear the ILOCK bit from
1298 * the lock_flags so that we
1299 * won't try to drop a lock
1300 * we don't hold below.
1301 */
1302 lock_flags &= ~XFS_ILOCK_SHARED;
1303 IPOINTER_REMOVE(ip_next, mp);
1304 } else if ((xfs_ipincount(ip) == 0) &&
1305 xfs_iflock_nowait(ip)) {
1306 ASSERT(ip->i_mount == mp);
1307 /*
1308 * Since this is vfs_sync()
1309 * calling we only flush the
1310 * inode out if we can lock
1311 * it without sleeping and
1312 * it is not pinned. Drop
1313 * the mount lock here so
1314 * that we don't hold it for
1315 * too long. We already have
1316 * a marker in the list here.
1317 */
1318 XFS_MOUNT_IUNLOCK(mp);
1319 mount_locked = B_FALSE;
1320 error = xfs_iflush(ip,
1321 XFS_IFLUSH_DELWRI);
1322 } else {
1323 ASSERT(ip->i_mount == mp);
1324 IPOINTER_REMOVE(ip_next, mp);
1325 }
1326 }
1327
1328 }
1329
1330 } else {
1331 if ((flags & SYNC_ATTR) &&
1332 ((ip->i_update_core) ||
1333 ((ip->i_itemp != NULL) &&
1334 (ip->i_itemp->ili_format.ilf_fields != 0)))) {
1335 if (mount_locked) {
1336 IPOINTER_INSERT(ip, mp);
1337 }
1338
1339 if (flags & SYNC_WAIT) {
1340 xfs_iflock(ip);
1341 error = xfs_iflush(ip,
1342 XFS_IFLUSH_SYNC);
1343 } else {
1344 /*
1345 * If we can't acquire the flush
1346 * lock, then the inode is already
1347 * being flushed so don't bother
1348 * waiting. If we can lock it then
1349 * do a delwri flush so we can
1350 * combine multiple inode flushes
1351 * in each disk write.
1352 */
1353 if (xfs_iflock_nowait(ip)) {
1354 error = xfs_iflush(ip,
1355 XFS_IFLUSH_DELWRI);
1356 }
1357 else if (bypassed)
1358 (*bypassed)++;
1359 }
1360 }
1361 }
1362
1363 if (lock_flags != 0) {
1364 xfs_iunlock(ip, lock_flags);
1365 }
1366
1367 if (vnode_refed) {
1368 /*
1369 * If we had to take a reference on the vnode
1370 * above, then wait until after we've unlocked
1371 * the inode to release the reference. This is
1372 * because we can be already holding the inode
1373 * lock when VN_RELE() calls xfs_inactive().
1374 *
1375 * Make sure to drop the mount lock before calling
1376 * VN_RELE() so that we don't trip over ourselves if
1377 * we have to go for the mount lock again in the
1378 * inactive code.
1379 */
1380 if (mount_locked) {
1381 IPOINTER_INSERT(ip, mp);
1382 }
1383
1384 VN_RELE(vp);
1385
1386 vnode_refed = B_FALSE;
1387 }
1388
1389 if (error) {
1390 last_error = error;
1391 }
1392
1393 /*
1394 * bail out if the filesystem is corrupted.
1395 */
1396 if (error == EFSCORRUPTED) {
1397 if (!mount_locked) {
1398 XFS_MOUNT_ILOCK(mp);
1399 IPOINTER_REMOVE(ip, mp);
1400 }
1401 XFS_MOUNT_IUNLOCK(mp);
1402 ASSERT(ipointer_in == B_FALSE);
1403 kmem_free(ipointer, sizeof(xfs_iptr_t));
1404 return XFS_ERROR(error);
1405 }
1406
1407 /* Let other threads have a chance at the mount lock
1408 * if we have looped many times without dropping the
1409 * lock.
1410 */
1411 if ((++preempt & XFS_PREEMPT_MASK) == 0) {
1412 if (mount_locked) {
1413 IPOINTER_INSERT(ip, mp);
1414 }
1415 }
1416
1417 if (mount_locked == B_FALSE) {
1418 XFS_MOUNT_ILOCK(mp);
1419 mount_locked = B_TRUE;
1420 IPOINTER_REMOVE(ip, mp);
1421 continue;
1422 }
1423
1424 ASSERT(ipointer_in == B_FALSE);
1425 ip = ip->i_mnext;
1426
1427 } while (ip != mp->m_inodes);
1428
1429 XFS_MOUNT_IUNLOCK(mp);
1430
1431 ASSERT(ipointer_in == B_FALSE);
1432
1433 kmem_free(ipointer, sizeof(xfs_iptr_t));
1434 return XFS_ERROR(last_error);
1435}
1436
1437/*
1438 * xfs sync routine for internal use
1439 *
1440 * This routine supports all of the flags defined for the generic VFS_SYNC
1441 * interface as explained above under xfs_sync. In the interests of not
1442 * changing interfaces within the 6.5 family, additional internally-
1443 * required functions are specified within a separate xflags parameter,
1444 * only available by calling this routine.
1445 *
1446 */
1447int
1448xfs_syncsub(
1449 xfs_mount_t *mp,
1450 int flags,
1451 int xflags,
1452 int *bypassed)
1453{
1454 int error = 0;
1455 int last_error = 0;
1456 uint log_flags = XFS_LOG_FORCE;
1457 xfs_buf_t *bp;
1458 xfs_buf_log_item_t *bip;
1459
1460 /*
1461 * Sync out the log. This ensures that the log is periodically
1462 * flushed even if there is not enough activity to fill it up.
1463 */
1464 if (flags & SYNC_WAIT)
1465 log_flags |= XFS_LOG_SYNC;
1466
1467 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
1468
1469 if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
1470 if (flags & SYNC_BDFLUSH)
1471 xfs_finish_reclaim_all(mp, 1);
1472 else
1473 error = xfs_sync_inodes(mp, flags, xflags, bypassed);
1474 }
1475
1476 /*
1477 * Flushing out dirty data above probably generated more
1478 * log activity, so if this isn't vfs_sync() then flush
1479 * the log again.
1480 */
1481 if (flags & SYNC_DELWRI) {
1482 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
1483 }
1484
1485 if (flags & SYNC_FSDATA) {
1486 /*
1487 * If this is vfs_sync() then only sync the superblock
1488 * if we can lock it without sleeping and it is not pinned.
1489 */
1490 if (flags & SYNC_BDFLUSH) {
1491 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
1492 if (bp != NULL) {
1493 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
1494 if ((bip != NULL) &&
1495 xfs_buf_item_dirty(bip)) {
1496 if (!(XFS_BUF_ISPINNED(bp))) {
1497 XFS_BUF_ASYNC(bp);
1498 error = xfs_bwrite(mp, bp);
1499 } else {
1500 xfs_buf_relse(bp);
1501 }
1502 } else {
1503 xfs_buf_relse(bp);
1504 }
1505 }
1506 } else {
1507 bp = xfs_getsb(mp, 0);
1508 /*
1509 * If the buffer is pinned then push on the log so
1510 * we won't get stuck waiting in the write for
1511 * someone, maybe ourselves, to flush the log.
1512 * Even though we just pushed the log above, we
1513 * did not have the superblock buffer locked at
1514 * that point so it can become pinned in between
1515 * there and here.
1516 */
1517 if (XFS_BUF_ISPINNED(bp))
1518 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
1519 if (flags & SYNC_WAIT)
1520 XFS_BUF_UNASYNC(bp);
1521 else
1522 XFS_BUF_ASYNC(bp);
1523 error = xfs_bwrite(mp, bp);
1524 }
1525 if (error) {
1526 last_error = error;
1527 }
1528 }
1529
1530 /*
1531 * If this is the periodic sync, then kick some entries out of
1532 * the reference cache. This ensures that idle entries are
1533 * eventually kicked out of the cache.
1534 */
1535 if (flags & SYNC_REFCACHE) {
1536 xfs_refcache_purge_some(mp);
1537 }
1538
1539 /*
1540 * Now check to see if the log needs a "dummy" transaction.
1541 */
1542
1543 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
1544 xfs_trans_t *tp;
1545 xfs_inode_t *ip;
1546
1547 /*
1548 * Put a dummy transaction in the log to tell
1549 * recovery that all others are OK.
1550 */
1551 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
1552 if ((error = xfs_trans_reserve(tp, 0,
1553 XFS_ICHANGE_LOG_RES(mp),
1554 0, 0, 0))) {
1555 xfs_trans_cancel(tp, 0);
1556 return error;
1557 }
1558
1559 ip = mp->m_rootip;
1560 xfs_ilock(ip, XFS_ILOCK_EXCL);
1561
1562 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1563 xfs_trans_ihold(tp, ip);
1564 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1565 error = xfs_trans_commit(tp, 0, NULL);
1566 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1567 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
1568 }
1569
1570 /*
1571 * When shutting down, we need to ensure that the AIL is pushed
1572 * to disk or the filesystem can appear corrupt from the PROM.
1573 */
1574 if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
1575 XFS_bflush(mp->m_ddev_targp);
1576 if (mp->m_rtdev_targp) {
1577 XFS_bflush(mp->m_rtdev_targp);
1578 }
1579 }
1580
1581 return XFS_ERROR(last_error);
1582}
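/*
 * Usage sketch (hypothetical caller, not from this file): a periodic
 * flush daemon could push metadata out without sleeping via
 *
 *	xfs_syncsub(mp, SYNC_BDFLUSH | SYNC_FSDATA | SYNC_REFCACHE, 0, NULL);
 *
 * while unmount reaches this code through xfs_sync() with
 * SYNC_WAIT | SYNC_CLOSE to force synchronous flushing and page
 * invalidation.
 */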
1583
1584/*
1585 * xfs_vget - called by DMAPI and NFSD to get vnode from file handle
1586 */
1587STATIC int
1588xfs_vget(
1589 bhv_desc_t *bdp,
1590 vnode_t **vpp,
1591 fid_t *fidp)
1592{
1593 xfs_mount_t *mp = XFS_BHVTOM(bdp);
1594 xfs_fid_t *xfid = (struct xfs_fid *)fidp;
1595 xfs_inode_t *ip;
1596 int error;
1597 xfs_ino_t ino;
1598 unsigned int igen;
1599
1600 /*
1601 * Invalid. Since handles can be created in user space and passed in
1602 * via gethandle(), this is not cause for a panic.
1603 */
1604 if (xfid->xfs_fid_len != sizeof(*xfid) - sizeof(xfid->xfs_fid_len))
1605 return XFS_ERROR(EINVAL);
1606
1607 ino = xfid->xfs_fid_ino;
1608 igen = xfid->xfs_fid_gen;
1609
1610 /*
1611 * NFS can sometimes send requests for ino 0. Fail them gracefully.
1612 */
1613 if (ino == 0)
1614 return XFS_ERROR(ESTALE);
1615
1616 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
1617 if (error) {
1618 *vpp = NULL;
1619 return error;
1620 }
1621
1622 if (ip == NULL) {
1623 *vpp = NULL;
1624 return XFS_ERROR(EIO);
1625 }
1626
1627 if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
1628 xfs_iput_new(ip, XFS_ILOCK_SHARED);
1629 *vpp = NULL;
1630 return XFS_ERROR(ENOENT);
1631 }
1632
1633 *vpp = XFS_ITOV(ip);
1634 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1635 return 0;
1636}
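/*
 * Note on the handle check above (illustrative): a well-formed handle
 * carries xfs_fid_len = sizeof(*xfid) - sizeof(xfid->xfs_fid_len), i.e.
 * the number of payload bytes following the length field itself, so any
 * other size coming from user space is rejected with EINVAL before the
 * ino/gen fields are trusted.
 */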
1637
1638
1639#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
1640#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
1641#define MNTOPT_LOGDEV "logdev" /* log device */
1642#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
1643#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
1644#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
1645#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */
1646#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
1647#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
1648#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
1649#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */
1650#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */
1651#define MNTOPT_MTPT "mtpt" /* filesystem mount point */
1652#define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */
1653#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
1654#define MNTOPT_NOLOGFLUSH "nologflush" /* don't hard flush on log writes */
1655#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
1656#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
1657#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
1658#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
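/*
 * Example option string (values are hypothetical) that this table lets
 * xfs_parseargs() below accept:
 *
 *	logbufs=8,logbsize=32k,sunit=512,swidth=4096,nouuid
 */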
1659
1660
1661int
1662xfs_parseargs(
1663 struct bhv_desc *bhv,
1664 char *options,
1665 struct xfs_mount_args *args,
1666 int update)
1667{
1668 struct vfs *vfsp = bhvtovfs(bhv);
1669 char *this_char, *value, *eov;
1670 int dsunit, dswidth, vol_dsunit, vol_dswidth;
1671 int iosize;
1672
1673#if 0	/* XXX: off by default, until some remaining issues are ironed out */
1674 args->flags |= XFSMNT_IDELETE; /* default to on */
1675#endif
1676
1677 if (!options)
1678 return 0;
1679
1680 iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
1681
1682 while ((this_char = strsep(&options, ",")) != NULL) {
1683 if (!*this_char)
1684 continue;
1685 if ((value = strchr(this_char, '=')) != NULL)
1686 *value++ = 0;
1687
1688 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
1689 if (!value || !*value) {
1690 printk("XFS: %s option requires an argument\n",
1691 MNTOPT_LOGBUFS);
1692 return EINVAL;
1693 }
1694 args->logbufs = simple_strtoul(value, &eov, 10);
1695 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
1696 int last, in_kilobytes = 0;
1697
1698 if (!value || !*value) {
1699 printk("XFS: %s option requires an argument\n",
1700 MNTOPT_LOGBSIZE);
1701 return EINVAL;
1702 }
1703 last = strlen(value) - 1;
1704 if (value[last] == 'K' || value[last] == 'k') {
1705 in_kilobytes = 1;
1706 value[last] = '\0';
1707 }
1708 args->logbufsize = simple_strtoul(value, &eov, 10);
1709 if (in_kilobytes)
1710 args->logbufsize <<= 10;
1711 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
1712 if (!value || !*value) {
1713 printk("XFS: %s option requires an argument\n",
1714 MNTOPT_LOGDEV);
1715 return EINVAL;
1716 }
1717 strncpy(args->logname, value, MAXNAMELEN);
1718 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
1719 if (!value || !*value) {
1720 printk("XFS: %s option requires an argument\n",
1721 MNTOPT_MTPT);
1722 return EINVAL;
1723 }
1724 strncpy(args->mtpt, value, MAXNAMELEN);
1725 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
1726 if (!value || !*value) {
1727 printk("XFS: %s option requires an argument\n",
1728 MNTOPT_RTDEV);
1729 return EINVAL;
1730 }
1731 strncpy(args->rtname, value, MAXNAMELEN);
1732 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
1733 if (!value || !*value) {
1734 printk("XFS: %s option requires an argument\n",
1735 MNTOPT_BIOSIZE);
1736 return EINVAL;
1737 }
1738 iosize = simple_strtoul(value, &eov, 10);
1739 args->flags |= XFSMNT_IOSIZE;
1740 args->iosizelog = (uint8_t) iosize;
1741 } else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
1742 if (!value || !*value) {
1743 printk("XFS: %s option requires an argument\n",
1744 this_char);
1745 return EINVAL;
1746 }
1747 args->flags |= XFSMNT_IHASHSIZE;
1748 args->ihashsize = simple_strtoul(value, &eov, 10);
1749 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
1750 args->flags |= XFSMNT_WSYNC;
1751 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
1752 args->flags |= XFSMNT_OSYNCISOSYNC;
1753 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
1754 args->flags |= XFSMNT_NORECOVERY;
1755 } else if (!strcmp(this_char, MNTOPT_INO64)) {
1756 args->flags |= XFSMNT_INO64;
1757#if !XFS_BIG_INUMS
1758 printk("XFS: %s option not allowed on this system\n",
1759 MNTOPT_INO64);
1760 return EINVAL;
1761#endif
1762 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
1763 args->flags |= XFSMNT_NOALIGN;
1764 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
1765 args->flags |= XFSMNT_SWALLOC;
1766 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
1767 if (!value || !*value) {
1768 printk("XFS: %s option requires an argument\n",
1769 MNTOPT_SUNIT);
1770 return EINVAL;
1771 }
1772 dsunit = simple_strtoul(value, &eov, 10);
1773 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
1774 if (!value || !*value) {
1775 printk("XFS: %s option requires an argument\n",
1776 MNTOPT_SWIDTH);
1777 return EINVAL;
1778 }
1779 dswidth = simple_strtoul(value, &eov, 10);
1780 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
1781 args->flags &= ~XFSMNT_32BITINODES;
1782#if !XFS_BIG_INUMS
1783 printk("XFS: %s option not allowed on this system\n",
1784 MNTOPT_64BITINODE);
1785 return EINVAL;
1786#endif
1787 } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
1788 args->flags |= XFSMNT_NOUUID;
1789 } else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
1790 args->flags |= XFSMNT_NOLOGFLUSH;
1791 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
1792 args->flags &= ~XFSMNT_IDELETE;
1793 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
1794 args->flags |= XFSMNT_IDELETE;
1795 } else if (!strcmp(this_char, "osyncisdsync")) {
1796 /* no-op, this is now the default */
1797printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
1798 } else if (!strcmp(this_char, "irixsgid")) {
1799printk("XFS: irixsgid is now a sysctl(2) variable, option is deprecated.\n");
1800 } else {
1801 printk("XFS: unknown mount option [%s].\n", this_char);
1802 return EINVAL;
1803 }
1804 }
1805
1806 if (args->flags & XFSMNT_NORECOVERY) {
1807 if ((vfsp->vfs_flag & VFS_RDONLY) == 0) {
1808 printk("XFS: no-recovery mounts must be read-only.\n");
1809 return EINVAL;
1810 }
1811 }
1812
1813 if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
1814 printk(
1815 "XFS: sunit and swidth options incompatible with the noalign option\n");
1816 return EINVAL;
1817 }
1818
1819 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
1820 printk("XFS: sunit and swidth must be specified together\n");
1821 return EINVAL;
1822 }
1823
1824 if (dsunit && (dswidth % dsunit != 0)) {
1825 printk(
1826 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
1827 dswidth, dsunit);
1828 return EINVAL;
1829 }
1830
1831 if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1832 if (dsunit) {
1833 args->sunit = dsunit;
1834 args->flags |= XFSMNT_RETERR;
1835 } else {
1836 args->sunit = vol_dsunit;
1837 }
1838 dswidth ? (args->swidth = dswidth) :
1839 (args->swidth = vol_dswidth);
1840 } else {
1841 args->sunit = args->swidth = 0;
1842 }
1843
1844 return 0;
1845}
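/*
 * Worked example for the geometry checks above (hypothetical numbers):
 * sunit=512,swidth=4096 passes, since 4096 % 512 == 0 (a width of eight
 * stripe units); sunit=512,swidth=4000 is rejected, as is giving either
 * value alone or combining them with the noalign option.
 */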
1846
1847int
1848xfs_showargs(
1849 struct bhv_desc *bhv,
1850 struct seq_file *m)
1851{
1852 static struct proc_xfs_info {
1853 int flag;
1854 char *str;
1855 } xfs_info[] = {
1856 /* the few simple ones we can get from the mount struct */
1857 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
1858 { XFS_MOUNT_INO64, "," MNTOPT_INO64 },
1859 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
1860 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
1861 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
1862 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
1863 { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC },
1864 { XFS_MOUNT_NOLOGFLUSH, "," MNTOPT_NOLOGFLUSH },
1865 { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP },
1866 { 0, NULL }
1867 };
1868 struct proc_xfs_info *xfs_infop;
1869 struct xfs_mount *mp = XFS_BHVTOM(bhv);
1870
1871 for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
1872 if (mp->m_flags & xfs_infop->flag)
1873 seq_puts(m, xfs_infop->str);
1874 }
1875
1876 if (mp->m_flags & XFS_MOUNT_IHASHSIZE)
1877 seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize);
1878
1879 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
1880 seq_printf(m, "," MNTOPT_BIOSIZE "=%d", mp->m_writeio_log);
1881
1882 if (mp->m_logbufs > 0)
1883 seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
1884
1885 if (mp->m_logbsize > 0)
1886 seq_printf(m, "," MNTOPT_LOGBSIZE "=%d", mp->m_logbsize);
1887
1888 if (mp->m_ddev_targp != mp->m_logdev_targp)
1889 seq_printf(m, "," MNTOPT_LOGDEV "=%s",
1890 XFS_BUFTARG_NAME(mp->m_logdev_targp));
1891
1892 if (mp->m_rtdev_targp && mp->m_ddev_targp != mp->m_rtdev_targp)
1893 seq_printf(m, "," MNTOPT_RTDEV "=%s",
1894 XFS_BUFTARG_NAME(mp->m_rtdev_targp));
1895
1896 if (mp->m_dalign > 0)
1897 seq_printf(m, "," MNTOPT_SUNIT "=%d",
1898 (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
1899
1900 if (mp->m_swidth > 0)
1901 seq_printf(m, "," MNTOPT_SWIDTH "=%d",
1902 (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
1903
1904 if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT))
1905 seq_printf(m, "," MNTOPT_64BITINODE);
1906
1907 return 0;
1908}
1909
1910STATIC void
1911xfs_freeze(
1912 bhv_desc_t *bdp)
1913{
1914 xfs_mount_t *mp = XFS_BHVTOM(bdp);
1915
1916 while (atomic_read(&mp->m_active_trans) > 0)
1917 delay(100);
1918
1919 /* Push the superblock and write an unmount record */
1920 xfs_log_unmount_write(mp);
1921 xfs_unmountfs_writesb(mp);
1922}
1923
1924
1925vfsops_t xfs_vfsops = {
1926 BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS),
1927 .vfs_parseargs = xfs_parseargs,
1928 .vfs_showargs = xfs_showargs,
1929 .vfs_mount = xfs_mount,
1930 .vfs_unmount = xfs_unmount,
1931 .vfs_mntupdate = xfs_mntupdate,
1932 .vfs_root = xfs_root,
1933 .vfs_statvfs = xfs_statvfs,
1934 .vfs_sync = xfs_sync,
1935 .vfs_vget = xfs_vget,
1936 .vfs_dmapiops = (vfs_dmapiops_t)fs_nosys,
1937 .vfs_quotactl = (vfs_quotactl_t)fs_nosys,
1938 .vfs_init_vnode = xfs_initialize_vnode,
1939 .vfs_force_shutdown = xfs_do_force_shutdown,
1940 .vfs_freeze = xfs_freeze,
1941};
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
new file mode 100644
index 000000000000..70092963ca9e
--- /dev/null
+++ b/fs/xfs/xfs_vnodeops.c
@@ -0,0 +1,4712 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_itable.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_alloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_da_btree.h"
60#include "xfs_attr.h"
61#include "xfs_rw.h"
62#include "xfs_refcache.h"
63#include "xfs_error.h"
64#include "xfs_bit.h"
65#include "xfs_rtalloc.h"
66#include "xfs_quota.h"
67#include "xfs_utils.h"
68#include "xfs_trans_space.h"
69#include "xfs_dir_leaf.h"
70#include "xfs_mac.h"
71#include "xfs_log_priv.h"
72
73
74/*
75 * The maximum pathlen is 1024 bytes. Since the minimum file system
76 * blocksize is 512 bytes, we can get a max of 2 extents back from
77 * bmapi.
78 */
79#define SYMLINK_MAPS 2
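/*
 * (That is, ceil(1024 bytes / 512-byte blocks) == 2 file system blocks,
 * so two extent mappings always suffice for a symlink's data.)
 */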
80
81/*
82 * For xfs, we check that the file isn't too big to be opened by this kernel.
83 * No other open action is required for regular files. Devices are handled
84 * through the specfs file system, pipes through fifofs. Device and
85 * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
86 * when a new vnode is first looked up or created.
87 */
88STATIC int
89xfs_open(
90 bhv_desc_t *bdp,
91 cred_t *credp)
92{
93 int mode;
94 vnode_t *vp;
95 xfs_inode_t *ip;
96
97 vp = BHV_TO_VNODE(bdp);
98 ip = XFS_BHVTOI(bdp);
99
100 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
101 return XFS_ERROR(EIO);
102
103 /*
104 * If it's a directory with any blocks, read-ahead block 0
105 * as we're almost certain to have the next operation be a read there.
106 */
107 if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
108 mode = xfs_ilock_map_shared(ip);
109 if (ip->i_d.di_nextents > 0)
110 (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
111 xfs_iunlock(ip, mode);
112 }
113 return 0;
114}
115
116
117/*
118 * xfs_getattr
119 */
120STATIC int
121xfs_getattr(
122 bhv_desc_t *bdp,
123 vattr_t *vap,
124 int flags,
125 cred_t *credp)
126{
127 xfs_inode_t *ip;
128 xfs_mount_t *mp;
129 vnode_t *vp;
130
131 vp = BHV_TO_VNODE(bdp);
132 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
133
134 ip = XFS_BHVTOI(bdp);
135 mp = ip->i_mount;
136
137 if (XFS_FORCED_SHUTDOWN(mp))
138 return XFS_ERROR(EIO);
139
140 if (!(flags & ATTR_LAZY))
141 xfs_ilock(ip, XFS_ILOCK_SHARED);
142
143 vap->va_size = ip->i_d.di_size;
144 if (vap->va_mask == XFS_AT_SIZE)
145 goto all_done;
146
147 vap->va_nblocks =
148 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
149 vap->va_nodeid = ip->i_ino;
150#if XFS_BIG_INUMS
151 vap->va_nodeid += mp->m_inoadd;
152#endif
153 vap->va_nlink = ip->i_d.di_nlink;
154
155 /*
156 * Quick exit for non-stat callers
157 */
158 if ((vap->va_mask &
159 ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
160 XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
161 goto all_done;
162
163 /*
164 * Copy from in-core inode.
165 */
166 vap->va_type = vp->v_type;
167 vap->va_mode = ip->i_d.di_mode & MODEMASK;
168 vap->va_uid = ip->i_d.di_uid;
169 vap->va_gid = ip->i_d.di_gid;
170 vap->va_projid = ip->i_d.di_projid;
171
172 /*
173	 * Check whether the vnode type is block/char vs. everything
174	 * else. Do it with a bitmask because that's faster than
175	 * testing multiple values individually.
176 */
177 if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
178 vap->va_rdev = 0;
179
180 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
181
182#if 0
183 /* Large block sizes confuse various
184 * user space programs, so letting the
185 * stripe size through is not a good
186 * idea for now.
187 */
188 vap->va_blocksize = mp->m_swidth ?
189 /*
190 * If the underlying volume is a stripe, then
191 * return the stripe width in bytes as the
192 * recommended I/O size.
193 */
194 (mp->m_swidth << mp->m_sb.sb_blocklog) :
195 /*
196 * Return the largest of the preferred buffer
197 * sizes since doing small I/Os into larger
198 * buffers causes buffers to be decommissioned.
199 * The value returned is in bytes.
200 */
201 (1 << (int)MAX(mp->m_readio_log,
202 mp->m_writeio_log));
203
204#else
205 vap->va_blocksize =
206 /*
207 * Return the largest of the preferred buffer
208 * sizes since doing small I/Os into larger
209 * buffers causes buffers to be decommissioned.
210 * The value returned is in bytes.
211 */
212 1 << (int)MAX(mp->m_readio_log,
213 mp->m_writeio_log);
214#endif
215 } else {
216
217 /*
218 * If the file blocks are being allocated from a
219 * realtime partition, then return the inode's
220 * realtime extent size or the realtime volume's
221 * extent size.
222 */
223 vap->va_blocksize = ip->i_d.di_extsize ?
224 (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
225 (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
226 }
227 } else {
228 vap->va_rdev = ip->i_df.if_u2.if_rdev;
229 vap->va_blocksize = BLKDEV_IOSIZE;
230 }
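		/*
		 * (For example, assuming the typical 64k defaults for
		 * both preferred I/O sizes -- readio_log == writeio_log
		 * == 16 -- the MAX() computation above reports
		 * 1 << 16 == 64k as va_blocksize.)
		 */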
231
232 vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
233 vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
234 vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
235 vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
236 vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
237 vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
238
239 /*
240 * Exit for stat callers. See if any of the rest of the fields
241 * to be filled in are needed.
242 */
243 if ((vap->va_mask &
244 (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
245 XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
246 goto all_done;
247
248 /*
249 * Convert di_flags to xflags.
250 */
251 vap->va_xflags = xfs_ip2xflags(ip);
252
253 /*
254 * Exit for inode revalidate. See if any of the rest of
255 * the fields to be filled in are needed.
256 */
257 if ((vap->va_mask &
258 (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
259 XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
260 goto all_done;
261
262 vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
263 vap->va_nextents =
264 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
265 ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
266 ip->i_d.di_nextents;
267 if (ip->i_afp)
268 vap->va_anextents =
269 (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
270 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
271 ip->i_d.di_anextents;
272 else
273 vap->va_anextents = 0;
274 vap->va_gen = ip->i_d.di_gen;
275
276 all_done:
277 if (!(flags & ATTR_LAZY))
278 xfs_iunlock(ip, XFS_ILOCK_SHARED);
279 return 0;
280}
281
282
283/*
284 * xfs_setattr
285 */
286int
287xfs_setattr(
288 bhv_desc_t *bdp,
289 vattr_t *vap,
290 int flags,
291 cred_t *credp)
292{
293 xfs_inode_t *ip;
294 xfs_trans_t *tp;
295 xfs_mount_t *mp;
296 int mask;
297 int code;
298 uint lock_flags;
299 uint commit_flags=0;
300 uid_t uid=0, iuid=0;
301 gid_t gid=0, igid=0;
302 int timeflags = 0;
303 vnode_t *vp;
304 xfs_prid_t projid=0, iprojid=0;
305 int mandlock_before, mandlock_after;
306 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
307 int file_owner;
308 int need_iolock = (flags & ATTR_DMI) == 0;
309
310 vp = BHV_TO_VNODE(bdp);
311 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
312
313 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
314 return XFS_ERROR(EROFS);
315
316 /*
317 * Cannot set certain attributes.
318 */
319 mask = vap->va_mask;
320 if (mask & XFS_AT_NOSET) {
321 return XFS_ERROR(EINVAL);
322 }
323
324 ip = XFS_BHVTOI(bdp);
325 mp = ip->i_mount;
326
327 if (XFS_FORCED_SHUTDOWN(mp))
328 return XFS_ERROR(EIO);
329
330 /*
331	 * Timestamp updates do not need to be logged and hence do
332	 * not need to be done within a transaction.
333 */
334 if (mask & XFS_AT_UPDTIMES) {
335 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
336 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
337 ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
338 ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
339 xfs_ichgtime(ip, timeflags);
340 return 0;
341 }
342
343 olddquot1 = olddquot2 = NULL;
344 udqp = gdqp = NULL;
345
346 /*
347	 * If disk quotas are on, we make sure that the dquots do exist on disk
348	 * before we start any other transactions. Trying to do this later
349	 * is messy. We don't care to take a readlock to look at the ids
350	 * in the inode here, because we can't hold it across the trans_reserve.
351 * If the IDs do change before we take the ilock, we're covered
352 * because the i_*dquot fields will get updated anyway.
353 */
354 if (XFS_IS_QUOTA_ON(mp) && (mask & (XFS_AT_UID|XFS_AT_GID))) {
355 uint qflags = 0;
356
357 if (mask & XFS_AT_UID) {
358 uid = vap->va_uid;
359 qflags |= XFS_QMOPT_UQUOTA;
360 } else {
361 uid = ip->i_d.di_uid;
362 }
363 if (mask & XFS_AT_GID) {
364 gid = vap->va_gid;
365 qflags |= XFS_QMOPT_GQUOTA;
366 } else {
367 gid = ip->i_d.di_gid;
368 }
369 /*
370 * We take a reference when we initialize udqp and gdqp,
371 * so it is important that we never blindly double trip on
372 * the same variable. See xfs_create() for an example.
373 */
374 ASSERT(udqp == NULL);
375 ASSERT(gdqp == NULL);
376 code = XFS_QM_DQVOPALLOC(mp, ip, uid,gid, qflags, &udqp, &gdqp);
377 if (code)
378 return (code);
379 }
380
381 /*
382 * For the other attributes, we acquire the inode lock and
383 * first do an error checking pass.
384 */
385 tp = NULL;
386 lock_flags = XFS_ILOCK_EXCL;
387 if (!(mask & XFS_AT_SIZE)) {
388 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
389 (mp->m_flags & XFS_MOUNT_WSYNC)) {
390 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
391 commit_flags = 0;
392 if ((code = xfs_trans_reserve(tp, 0,
393 XFS_ICHANGE_LOG_RES(mp), 0,
394 0, 0))) {
395 lock_flags = 0;
396 goto error_return;
397 }
398 }
399 } else {
400 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
401 !(flags & ATTR_DMI)) {
402 int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
403 code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
404 vap->va_size, 0, dmflags, NULL);
405 if (code) {
406 lock_flags = 0;
407 goto error_return;
408 }
409 }
410 if (need_iolock)
411 lock_flags |= XFS_IOLOCK_EXCL;
412 }
413
414 xfs_ilock(ip, lock_flags);
415
416 /* boolean: are we the file owner? */
417 file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
418
419 /*
420 * Change various properties of a file.
421 * Only the owner or users with CAP_FOWNER
422 * capability may do these things.
423 */
424 if (mask &
425 (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
426 XFS_AT_GID|XFS_AT_PROJID)) {
427 /*
428 * CAP_FOWNER overrides the following restrictions:
429 *
430 * The user ID of the calling process must be equal
431 * to the file owner ID, except in cases where the
432 * CAP_FSETID capability is applicable.
433 */
434 if (!file_owner && !capable(CAP_FOWNER)) {
435 code = XFS_ERROR(EPERM);
436 goto error_return;
437 }
438
439 /*
440 * CAP_FSETID overrides the following restrictions:
441 *
442 * The effective user ID of the calling process shall match
443 * the file owner when setting the set-user-ID and
444 * set-group-ID bits on that file.
445 *
446 * The effective group ID or one of the supplementary group
447 * IDs of the calling process shall match the group owner of
448	 * the file when setting the set-group-ID bit on that file.
449 */
450 if (mask & XFS_AT_MODE) {
451 mode_t m = 0;
452
453 if ((vap->va_mode & S_ISUID) && !file_owner)
454 m |= S_ISUID;
455 if ((vap->va_mode & S_ISGID) &&
456 !in_group_p((gid_t)ip->i_d.di_gid))
457 m |= S_ISGID;
458#if 0
459 /* Linux allows this, Irix doesn't. */
460 if ((vap->va_mode & S_ISVTX) && vp->v_type != VDIR)
461 m |= S_ISVTX;
462#endif
463 if (m && !capable(CAP_FSETID))
464 vap->va_mode &= ~m;
465 }
466 }
467
468 /*
469 * Change file ownership. Must be the owner or privileged.
470 * If the system was configured with the "restricted_chown"
471 * option, the owner is not permitted to give away the file,
472 * and can change the group id only to a group of which he
473 * or she is a member.
474 */
475 if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
476 /*
477 * These IDs could have changed since we last looked at them.
478 * But, we're assured that if the ownership did change
479	 * while we didn't have the inode locked, the inode's dquot(s)
480 * would have changed also.
481 */
482 iuid = ip->i_d.di_uid;
483 iprojid = ip->i_d.di_projid;
484 igid = ip->i_d.di_gid;
485 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
486 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
487 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
488 iprojid;
489
490 /*
491 * CAP_CHOWN overrides the following restrictions:
492 *
493 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
494 * shall override the restriction that a process cannot
495 * change the user ID of a file it owns and the restriction
496 * that the group ID supplied to the chown() function
497 * shall be equal to either the group ID or one of the
498 * supplementary group IDs of the calling process.
499 *
500 * XXX: How does restricted_chown affect projid?
501 */
502 if (restricted_chown &&
503 (iuid != uid || (igid != gid &&
504 !in_group_p((gid_t)gid))) &&
505 !capable(CAP_CHOWN)) {
506 code = XFS_ERROR(EPERM);
507 goto error_return;
508 }
509 /*
510 * Do a quota reservation only if uid or gid is actually
511 * going to change.
512 */
513 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
514 (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
515 ASSERT(tp);
516 code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
517 capable(CAP_FOWNER) ?
518 XFS_QMOPT_FORCE_RES : 0);
519 if (code) /* out of quota */
520 goto error_return;
521 }
522 }
523
524 /*
525 * Truncate file. Must have write permission and not be a directory.
526 */
527 if (mask & XFS_AT_SIZE) {
528 /* Short circuit the truncate case for zero length files */
529 if ((vap->va_size == 0) &&
530 (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
531 xfs_iunlock(ip, XFS_ILOCK_EXCL);
532 lock_flags &= ~XFS_ILOCK_EXCL;
533 if (mask & XFS_AT_CTIME)
534 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
535 code = 0;
536 goto error_return;
537 }
538
539 if (vp->v_type == VDIR) {
540 code = XFS_ERROR(EISDIR);
541 goto error_return;
542 } else if (vp->v_type != VREG) {
543 code = XFS_ERROR(EINVAL);
544 goto error_return;
545 }
546 /*
547 * Make sure that the dquots are attached to the inode.
548 */
549 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
550 goto error_return;
551 }
552
553 /*
554 * Change file access or modified times.
555 */
556 if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
557 if (!file_owner) {
558 if ((flags & ATTR_UTIME) &&
559 !capable(CAP_FOWNER)) {
560 code = XFS_ERROR(EPERM);
561 goto error_return;
562 }
563 }
564 }
565
566 /*
567 * Change extent size or realtime flag.
568 */
569 if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
570 /*
571 * Can't change extent size if any extents are allocated.
572 */
573 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
574 (mask & XFS_AT_EXTSIZE) &&
575 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
576 vap->va_extsize) ) {
577 code = XFS_ERROR(EINVAL); /* EFBIG? */
578 goto error_return;
579 }
580
581 /*
582 * Can't set extent size unless the file is marked, or
583 * about to be marked as a realtime file.
584 *
585 * This check will be removed when fixed size extents
586 * with buffered data writes is implemented.
587 *
588 */
589 if ((mask & XFS_AT_EXTSIZE) &&
590 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
591 vap->va_extsize) &&
592 (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
593 ((mask & XFS_AT_XFLAGS) &&
594 (vap->va_xflags & XFS_XFLAG_REALTIME))))) {
595 code = XFS_ERROR(EINVAL);
596 goto error_return;
597 }
598
599 /*
600 * Can't change realtime flag if any extents are allocated.
601 */
602 if (ip->i_d.di_nextents && (mask & XFS_AT_XFLAGS) &&
603 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
604 (vap->va_xflags & XFS_XFLAG_REALTIME)) {
605 code = XFS_ERROR(EINVAL); /* EFBIG? */
606 goto error_return;
607 }
608 /*
609 * Extent size must be a multiple of the appropriate block
610 * size, if set at all.
611 */
612 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
613 xfs_extlen_t size;
614
615 if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
616 ((mask & XFS_AT_XFLAGS) &&
617 (vap->va_xflags & XFS_XFLAG_REALTIME))) {
618 size = mp->m_sb.sb_rextsize <<
619 mp->m_sb.sb_blocklog;
620 } else {
621 size = mp->m_sb.sb_blocksize;
622 }
623 if (vap->va_extsize % size) {
624 code = XFS_ERROR(EINVAL);
625 goto error_return;
626 }
627 }
628 /*
629 * If realtime flag is set then must have realtime data.
630 */
631 if ((mask & XFS_AT_XFLAGS) &&
632 (vap->va_xflags & XFS_XFLAG_REALTIME)) {
633 if ((mp->m_sb.sb_rblocks == 0) ||
634 (mp->m_sb.sb_rextsize == 0) ||
635 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
636 code = XFS_ERROR(EINVAL);
637 goto error_return;
638 }
639 }
640
641 /*
642 * Can't modify an immutable/append-only file unless
643 * we have appropriate permission.
644 */
645 if ((mask & XFS_AT_XFLAGS) &&
646 (ip->i_d.di_flags &
647 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
648 (vap->va_xflags &
649 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
650 !capable(CAP_LINUX_IMMUTABLE)) {
651 code = XFS_ERROR(EPERM);
652 goto error_return;
653 }
654 }
655
656 /*
657 * Now we can make the changes. Before we join the inode
658 * to the transaction, if XFS_AT_SIZE is set then take care of
659 * the part of the truncation that must be done without the
660 * inode lock. This needs to be done before joining the inode
661 * to the transaction, because the inode cannot be unlocked
662 * once it is a part of the transaction.
663 */
664 if (mask & XFS_AT_SIZE) {
665 code = 0;
666 if (vap->va_size > ip->i_d.di_size)
667 code = xfs_igrow_start(ip, vap->va_size, credp);
668 xfs_iunlock(ip, XFS_ILOCK_EXCL);
669 if (!code)
670 code = xfs_itruncate_data(ip, vap->va_size);
671 if (code) {
672 ASSERT(tp == NULL);
673 lock_flags &= ~XFS_ILOCK_EXCL;
674 ASSERT(lock_flags == XFS_IOLOCK_EXCL);
675 goto error_return;
676 }
677 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
678 if ((code = xfs_trans_reserve(tp, 0,
679 XFS_ITRUNCATE_LOG_RES(mp), 0,
680 XFS_TRANS_PERM_LOG_RES,
681 XFS_ITRUNCATE_LOG_COUNT))) {
682 xfs_trans_cancel(tp, 0);
683 if (need_iolock)
684 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
685 return code;
686 }
687 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
688 xfs_ilock(ip, XFS_ILOCK_EXCL);
689 }
690
691 if (tp) {
692 xfs_trans_ijoin(tp, ip, lock_flags);
693 xfs_trans_ihold(tp, ip);
694 }
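	/*
	 * (xfs_trans_ijoin() attaches the locked inode to the
	 * transaction; xfs_trans_ihold() tells the transaction not to
	 * release the inode when it commits, so the inode stays locked
	 * and referenced for the rest of this routine.)
	 */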
695
696 /* determine whether mandatory locking mode changes */
697 mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
698
699 /*
700 * Truncate file. Must have write permission and not be a directory.
701 */
702 if (mask & XFS_AT_SIZE) {
703 if (vap->va_size > ip->i_d.di_size) {
704 xfs_igrow_finish(tp, ip, vap->va_size,
705 !(flags & ATTR_DMI));
706 } else if ((vap->va_size <= ip->i_d.di_size) ||
707 ((vap->va_size == 0) && ip->i_d.di_nextents)) {
708 /*
709 * signal a sync transaction unless
710 * we're truncating an already unlinked
711 * file on a wsync filesystem
712 */
713 code = xfs_itruncate_finish(&tp, ip,
714 (xfs_fsize_t)vap->va_size,
715 XFS_DATA_FORK,
716 ((ip->i_d.di_nlink != 0 ||
717 !(mp->m_flags & XFS_MOUNT_WSYNC))
718 ? 1 : 0));
719 if (code) {
720 goto abort_return;
721 }
722 }
723 /*
724 * Have to do this even if the file's size doesn't change.
725 */
726 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
727 }
728
729 /*
730 * Change file access modes.
731 */
732 if (mask & XFS_AT_MODE) {
733 ip->i_d.di_mode &= S_IFMT;
734 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
735
736 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
737 timeflags |= XFS_ICHGTIME_CHG;
738 }
739
740 /*
741 * Change file ownership. Must be the owner or privileged.
742 * If the system was configured with the "restricted_chown"
743 * option, the owner is not permitted to give away the file,
744 * and can change the group id only to a group of which he
745 * or she is a member.
746 */
747 if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
748 /*
749 * CAP_FSETID overrides the following restrictions:
750 *
751 * The set-user-ID and set-group-ID bits of a file will be
752 * cleared upon successful return from chown()
753 */
754 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
755 !capable(CAP_FSETID)) {
756 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
757 }
758
759 /*
760 * Change the ownerships and register quota modifications
761 * in the transaction.
762 */
763 if (iuid != uid) {
764 if (XFS_IS_UQUOTA_ON(mp)) {
765 ASSERT(mask & XFS_AT_UID);
766 ASSERT(udqp);
767 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
768 &ip->i_udquot, udqp);
769 }
770 ip->i_d.di_uid = uid;
771 }
772 if (igid != gid) {
773 if (XFS_IS_GQUOTA_ON(mp)) {
774 ASSERT(mask & XFS_AT_GID);
775 ASSERT(gdqp);
776 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
777 &ip->i_gdquot, gdqp);
778 }
779 ip->i_d.di_gid = gid;
780 }
781 if (iprojid != projid) {
782 ip->i_d.di_projid = projid;
783 /*
784 * We may have to rev the inode as well as
785 * the superblock version number since projids didn't
786 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
787 */
788 if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
789 xfs_bump_ino_vers2(tp, ip);
790 }
791
792 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
793 timeflags |= XFS_ICHGTIME_CHG;
794 }
795
796
797 /*
798 * Change file access or modified times.
799 */
800 if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
801 if (mask & XFS_AT_ATIME) {
802 ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
803 ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
804 ip->i_update_core = 1;
805 timeflags &= ~XFS_ICHGTIME_ACC;
806 }
807 if (mask & XFS_AT_MTIME) {
808 ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
809 ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
810 timeflags &= ~XFS_ICHGTIME_MOD;
811 timeflags |= XFS_ICHGTIME_CHG;
812 }
813 if (tp && (flags & ATTR_UTIME))
814 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
815 }
816
817 /*
818 * Change XFS-added attributes.
819 */
820 if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
821 if (mask & XFS_AT_EXTSIZE) {
822 /*
823 * Converting bytes to fs blocks.
824 */
825 ip->i_d.di_extsize = vap->va_extsize >>
826 mp->m_sb.sb_blocklog;
827 }
828 if (mask & XFS_AT_XFLAGS) {
829 uint di_flags;
830
831 /* can't set PREALLOC this way, just preserve it */
832 di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
833 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
834 di_flags |= XFS_DIFLAG_IMMUTABLE;
835 if (vap->va_xflags & XFS_XFLAG_APPEND)
836 di_flags |= XFS_DIFLAG_APPEND;
837 if (vap->va_xflags & XFS_XFLAG_SYNC)
838 di_flags |= XFS_DIFLAG_SYNC;
839 if (vap->va_xflags & XFS_XFLAG_NOATIME)
840 di_flags |= XFS_DIFLAG_NOATIME;
841 if (vap->va_xflags & XFS_XFLAG_NODUMP)
842 di_flags |= XFS_DIFLAG_NODUMP;
843 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
844 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
845 di_flags |= XFS_DIFLAG_RTINHERIT;
846 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
847 di_flags |= XFS_DIFLAG_NOSYMLINKS;
848 } else {
849 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
850 di_flags |= XFS_DIFLAG_REALTIME;
851 ip->i_iocore.io_flags |= XFS_IOCORE_RT;
852 } else {
853 ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
854 }
855 }
856 ip->i_d.di_flags = di_flags;
857 }
858 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
859 timeflags |= XFS_ICHGTIME_CHG;
860 }
861
862 /*
863 * Change file inode change time only if XFS_AT_CTIME set
864 * AND we have been called by a DMI function.
865 */
866
867 if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
868 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
869 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
870 ip->i_update_core = 1;
871 timeflags &= ~XFS_ICHGTIME_CHG;
872 }
873
874 /*
875 * Send out timestamp changes that need to be set to the
876 * current time. Not done when called by a DMI function.
877 */
878 if (timeflags && !(flags & ATTR_DMI))
879 xfs_ichgtime(ip, timeflags);
880
881 XFS_STATS_INC(xs_ig_attrchg);
882
883 /*
884 * If this is a synchronous mount, make sure that the
885 * transaction goes to disk before returning to the user.
886 * This is slightly sub-optimal in that truncates require
887	 * two sync transactions instead of one for wsync filesystems.
888 * One for the truncate and one for the timestamps since we
889 * don't want to change the timestamps unless we're sure the
890 * truncate worked. Truncates are less than 1% of the laddis
891 * mix so this probably isn't worth the trouble to optimize.
892 */
893 code = 0;
894 if (tp) {
895 if (mp->m_flags & XFS_MOUNT_WSYNC)
896 xfs_trans_set_sync(tp);
897
898 code = xfs_trans_commit(tp, commit_flags, NULL);
899 }
900
901 /*
902 * If the (regular) file's mandatory locking mode changed, then
903 * notify the vnode. We do this under the inode lock to prevent
904 * racing calls to vop_vnode_change.
905 */
906 mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
907 if (mandlock_before != mandlock_after) {
908 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
909 mandlock_after);
910 }
911
912 xfs_iunlock(ip, lock_flags);
913
914 /*
915 * Release any dquot(s) the inode had kept before chown.
916 */
917 XFS_QM_DQRELE(mp, olddquot1);
918 XFS_QM_DQRELE(mp, olddquot2);
919 XFS_QM_DQRELE(mp, udqp);
920 XFS_QM_DQRELE(mp, gdqp);
921
922 if (code) {
923 return code;
924 }
925
926 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
927 !(flags & ATTR_DMI)) {
928 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
929 NULL, DM_RIGHT_NULL, NULL, NULL,
930 0, 0, AT_DELAY_FLAG(flags));
931 }
932 return 0;
933
934 abort_return:
935 commit_flags |= XFS_TRANS_ABORT;
936 /* FALLTHROUGH */
937 error_return:
938 XFS_QM_DQRELE(mp, udqp);
939 XFS_QM_DQRELE(mp, gdqp);
940 if (tp) {
941 xfs_trans_cancel(tp, commit_flags);
942 }
943 if (lock_flags != 0) {
944 xfs_iunlock(ip, lock_flags);
945 }
946 return code;
947}
948
949
950/*
951 * xfs_access
952 * Null conversion from vnode mode bits to inode mode bits, as in efs.
953 */
954STATIC int
955xfs_access(
956 bhv_desc_t *bdp,
957 int mode,
958 cred_t *credp)
959{
960 xfs_inode_t *ip;
961 int error;
962
963 vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
964 (inst_t *)__return_address);
965
966 ip = XFS_BHVTOI(bdp);
967 xfs_ilock(ip, XFS_ILOCK_SHARED);
968 error = xfs_iaccess(ip, mode, credp);
969 xfs_iunlock(ip, XFS_ILOCK_SHARED);
970 return error;
971}
972
973
974/*
975 * xfs_readlink
976 *
977 */
978STATIC int
979xfs_readlink(
980 bhv_desc_t *bdp,
981 uio_t *uiop,
982 int ioflags,
983 cred_t *credp)
984{
985 xfs_inode_t *ip;
986 int count;
987 xfs_off_t offset;
988 int pathlen;
989 vnode_t *vp;
990 int error = 0;
991 xfs_mount_t *mp;
992 int nmaps;
993 xfs_bmbt_irec_t mval[SYMLINK_MAPS];
994 xfs_daddr_t d;
995 int byte_cnt;
996 int n;
997 xfs_buf_t *bp;
998
999 vp = BHV_TO_VNODE(bdp);
1000 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1001
1002 ip = XFS_BHVTOI(bdp);
1003 mp = ip->i_mount;
1004
1005 if (XFS_FORCED_SHUTDOWN(mp))
1006 return XFS_ERROR(EIO);
1007
1008 xfs_ilock(ip, XFS_ILOCK_SHARED);
1009
1010 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
1011
1012 offset = uiop->uio_offset;
1013 count = uiop->uio_resid;
1014
1015 if (offset < 0) {
1016 error = XFS_ERROR(EINVAL);
1017 goto error_return;
1018 }
1019 if (count <= 0) {
1020 error = 0;
1021 goto error_return;
1022 }
1023
1024 if (!(ioflags & IO_INVIS)) {
1025 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
1026 }
1027
1028 /*
1029 * See if the symlink is stored inline.
1030 */
1031 pathlen = (int)ip->i_d.di_size;
1032
1033 if (ip->i_df.if_flags & XFS_IFINLINE) {
1034 error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1035 }
1036 else {
1037 /*
1038 * Symlink not inline. Call bmap to get it in.
1039 */
1040 nmaps = SYMLINK_MAPS;
1041
1042 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1043 0, NULL, 0, mval, &nmaps, NULL);
1044
1045 if (error) {
1046 goto error_return;
1047 }
1048
1049 for (n = 0; n < nmaps; n++) {
1050 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1051 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1052 bp = xfs_buf_read(mp->m_ddev_targp, d,
1053 BTOBB(byte_cnt), 0);
1054 error = XFS_BUF_GETERROR(bp);
1055 if (error) {
1056 xfs_ioerror_alert("xfs_readlink",
1057 ip->i_mount, bp, XFS_BUF_ADDR(bp));
1058 xfs_buf_relse(bp);
1059 goto error_return;
1060 }
1061 if (pathlen < byte_cnt)
1062 byte_cnt = pathlen;
1063 pathlen -= byte_cnt;
1064
1065 error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1066 xfs_buf_relse (bp);
1067 }
1068
1069 }
1070
1071
1072error_return:
1073
1074 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1075
1076 return error;
1077}
1078
1079
1080/*
1081 * xfs_fsync
1082 *
1083 * This is called to sync the inode and its data out to disk.
1084 * We need to hold the I/O lock while flushing the data, and
1085 * the inode lock while flushing the inode. The inode lock CANNOT
1086	 * be held while flushing the data, so acquire it after we're done
1087 * with that.
1088 */
1089STATIC int
1090xfs_fsync(
1091 bhv_desc_t *bdp,
1092 int flag,
1093 cred_t *credp,
1094 xfs_off_t start,
1095 xfs_off_t stop)
1096{
1097 xfs_inode_t *ip;
1098 xfs_trans_t *tp;
1099 int error;
1100
1101 vn_trace_entry(BHV_TO_VNODE(bdp),
1102 __FUNCTION__, (inst_t *)__return_address);
1103
1104 ip = XFS_BHVTOI(bdp);
1105
1106 ASSERT(start >= 0 && stop >= -1);
1107
1108 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1109 return XFS_ERROR(EIO);
1110
1111 /*
1112 * We always need to make sure that the required inode state
1113	 * is safe on disk. The vnode might be clean, but still need
1114	 * flushing because of committed transactions that haven't hit the disk yet.
1115 * Likewise, there could be unflushed non-transactional
1116 * changes to the inode core that have to go to disk.
1117 *
1118 * The following code depends on one assumption: that
1119 * any transaction that changes an inode logs the core
1120 * because it has to change some field in the inode core
1121 * (typically nextents or nblocks). That assumption
1122 * implies that any transactions against an inode will
1123 * catch any non-transactional updates. If inode-altering
1124 * transactions exist that violate this assumption, the
1125 * code breaks. Right now, it figures that if the involved
1126 * update_* field is clear and the inode is unpinned, the
1127 * inode is clean. Either it's been flushed or it's been
1128 * committed and the commit has hit the disk unpinning the inode.
1129 * (Note that xfs_inode_item_format() called at commit clears
1130 * the update_* fields.)
1131 */
1132 xfs_ilock(ip, XFS_ILOCK_SHARED);
1133
1134 /* If we are flushing data then we care about update_size
1135 * being set, otherwise we care about update_core
1136 */
1137 if ((flag & FSYNC_DATA) ?
1138 (ip->i_update_size == 0) :
1139 (ip->i_update_core == 0)) {
1140 /*
1141 * Timestamps/size haven't changed since last inode
1142 * flush or inode transaction commit. That means
1143 * either nothing got written or a transaction
1144 * committed which caught the updates. If the
1145 * latter happened and the transaction hasn't
1146	 * hit the disk yet, the inode will still
1147 * be pinned. If it is, force the log.
1148 */
1149
1150 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1151
1152 if (xfs_ipincount(ip)) {
1153 xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1154 XFS_LOG_FORCE |
1155 ((flag & FSYNC_WAIT)
1156 ? XFS_LOG_SYNC : 0));
1157 }
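		/*
		 * ("Pinned" means a committed transaction touching the
		 * inode has not yet reached the on-disk log; forcing
		 * the log is what unpins it.)
		 */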
1158 error = 0;
1159 } else {
1160 /*
1161 * Kick off a transaction to log the inode
1162 * core to get the updates. Make it
1163 * sync if FSYNC_WAIT is passed in (which
1164 * is done by everybody but specfs). The
1165 * sync transaction will also force the log.
1166 */
1167 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1168 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1169 if ((error = xfs_trans_reserve(tp, 0,
1170 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1171 0, 0, 0))) {
1172 xfs_trans_cancel(tp, 0);
1173 return error;
1174 }
1175 xfs_ilock(ip, XFS_ILOCK_EXCL);
1176
1177 /*
1178 * Note - it's possible that we might have pushed
1179 * ourselves out of the way during trans_reserve
1180 * which would flush the inode. But there's no
1181 * guarantee that the inode buffer has actually
1182 * gone out yet (it's delwri). Plus the buffer
1183 * could be pinned anyway if it's part of an
1184 * inode in another recent transaction. So we
1185 * play it safe and fire off the transaction anyway.
1186 */
1187 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1188 xfs_trans_ihold(tp, ip);
1189 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1190 if (flag & FSYNC_WAIT)
1191 xfs_trans_set_sync(tp);
1192 error = xfs_trans_commit(tp, 0, NULL);
1193
1194 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1195 }
1196 return error;
1197}
1198
1199/*
1200 * This is called by xfs_inactive to free any blocks beyond eof,
1201 * when the link count isn't zero.
1202 */
1203STATIC int
1204xfs_inactive_free_eofblocks(
1205 xfs_mount_t *mp,
1206 xfs_inode_t *ip)
1207{
1208 xfs_trans_t *tp;
1209 int error;
1210 xfs_fileoff_t end_fsb;
1211 xfs_fileoff_t last_fsb;
1212 xfs_filblks_t map_len;
1213 int nimaps;
1214 xfs_bmbt_irec_t imap;
1215
1216 /*
1217 * Figure out if there are any blocks beyond the end
1218 * of the file. If not, then there is nothing to do.
1219 */
1220 end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
1221 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1222 map_len = last_fsb - end_fsb;
1223 if (map_len <= 0)
1224 return (0);
1225
1226 nimaps = 1;
1227 xfs_ilock(ip, XFS_ILOCK_SHARED);
1228 error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1229 NULL, 0, &imap, &nimaps, NULL);
1230 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1231
1232 if (!error && (nimaps != 0) &&
1233 (imap.br_startblock != HOLESTARTBLOCK)) {
1234 /*
1235 * Attach the dquots to the inode up front.
1236 */
1237 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1238 return (error);
1239
1240 /*
1241 * There are blocks after the end of file.
1242 * Free them up now by truncating the file to
1243 * its current size.
1244 */
1245 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1246
1247 /*
1248	 * Do the xfs_itruncate_start() call before
1249	 * reserving any log space, because
1250	 * itruncate_start will call into the buffer
1251	 * cache and we can't do that within a
1252	 * transaction.
1253 */
1254 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1255 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1256 ip->i_d.di_size);
1257
1258 error = xfs_trans_reserve(tp, 0,
1259 XFS_ITRUNCATE_LOG_RES(mp),
1260 0, XFS_TRANS_PERM_LOG_RES,
1261 XFS_ITRUNCATE_LOG_COUNT);
1262 if (error) {
1263 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1264 xfs_trans_cancel(tp, 0);
1265 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1266 return (error);
1267 }
1268
1269 xfs_ilock(ip, XFS_ILOCK_EXCL);
1270 xfs_trans_ijoin(tp, ip,
1271 XFS_IOLOCK_EXCL |
1272 XFS_ILOCK_EXCL);
1273 xfs_trans_ihold(tp, ip);
1274
1275 error = xfs_itruncate_finish(&tp, ip,
1276 ip->i_d.di_size,
1277 XFS_DATA_FORK,
1278 0);
1279 /*
1280 * If we get an error at this point we
1281 * simply don't bother truncating the file.
1282 */
1283 if (error) {
1284 xfs_trans_cancel(tp,
1285 (XFS_TRANS_RELEASE_LOG_RES |
1286 XFS_TRANS_ABORT));
1287 } else {
1288 error = xfs_trans_commit(tp,
1289 XFS_TRANS_RELEASE_LOG_RES,
1290 NULL);
1291 }
1292 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1293 }
1294 return (error);
1295}
1296
1297/*
1298 * Free a symlink that has blocks associated with it.
1299 */
1300STATIC int
1301xfs_inactive_symlink_rmt(
1302 xfs_inode_t *ip,
1303 xfs_trans_t **tpp)
1304{
1305 xfs_buf_t *bp;
1306 int committed;
1307 int done;
1308 int error;
1309 xfs_fsblock_t first_block;
1310 xfs_bmap_free_t free_list;
1311 int i;
1312 xfs_mount_t *mp;
1313 xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1314 int nmaps;
1315 xfs_trans_t *ntp;
1316 int size;
1317 xfs_trans_t *tp;
1318
1319 tp = *tpp;
1320 mp = ip->i_mount;
1321 ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1322 /*
1323 * We're freeing a symlink that has some
1324 * blocks allocated to it. Free the
1325 * blocks here. We know that we've got
1326 * either 1 or 2 extents and that we can
1327 * free them all in one bunmapi call.
1328 */
1329 ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1330 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1331 XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1332 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1333 xfs_trans_cancel(tp, 0);
1334 *tpp = NULL;
1335 return error;
1336 }
1337 /*
1338 * Lock the inode, fix the size, and join it to the transaction.
1339	 * Hold it so that in the normal path we still have it locked for
1340 * the second transaction. In the error paths we need it
1341 * held so the cancel won't rele it, see below.
1342 */
1343 xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1344 size = (int)ip->i_d.di_size;
1345 ip->i_d.di_size = 0;
1346 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1347 xfs_trans_ihold(tp, ip);
1348 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1349 /*
1350 * Find the block(s) so we can inval and unmap them.
1351 */
1352 done = 0;
1353 XFS_BMAP_INIT(&free_list, &first_block);
1354 nmaps = sizeof(mval) / sizeof(mval[0]);
1355 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1356 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1357 &free_list)))
1358 goto error0;
1359 /*
1360 * Invalidate the block(s).
1361 */
1362 for (i = 0; i < nmaps; i++) {
1363 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1364 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1365 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1366 xfs_trans_binval(tp, bp);
1367 }
1368 /*
1369 * Unmap the dead block(s) to the free_list.
1370 */
1371 if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1372 &first_block, &free_list, &done)))
1373 goto error1;
1374 ASSERT(done);
1375 /*
1376 * Commit the first transaction. This logs the EFI and the inode.
1377 */
1378 if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
1379 goto error1;
1380 /*
1381 * The transaction must have been committed, since there were
1382 * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
1383 * The new tp has the extent freeing and EFDs.
1384 */
1385 ASSERT(committed);
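	/*
	 * (EFI and EFD are the extent-free-intent and extent-free-done
	 * log items; pairing them lets log recovery redo the free if we
	 * crash between the two transactions.)
	 */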
1386 /*
1387 * The first xact was committed, so add the inode to the new one.
1388 * Mark it dirty so it will be logged and moved forward in the log as
1389 * part of every commit.
1390 */
1391 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1392 xfs_trans_ihold(tp, ip);
1393 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1394 /*
1395 * Get a new, empty transaction to return to our caller.
1396 */
1397 ntp = xfs_trans_dup(tp);
1398 /*
1399	 * Commit the transaction containing the extent freeing and EFDs.
1400 * If we get an error on the commit here or on the reserve below,
1401 * we need to unlock the inode since the new transaction doesn't
1402 * have the inode attached.
1403 */
1404 error = xfs_trans_commit(tp, 0, NULL);
1405 tp = ntp;
1406 if (error) {
1407 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1408 goto error0;
1409 }
1410 /*
1411 * Remove the memory for extent descriptions (just bookkeeping).
1412 */
1413 if (ip->i_df.if_bytes)
1414 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1415 ASSERT(ip->i_df.if_bytes == 0);
1416 /*
1417 * Put an itruncate log reservation in the new transaction
1418 * for our caller.
1419 */
1420 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1421 XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1422 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1423 goto error0;
1424 }
1425 /*
1426 * Return with the inode locked but not joined to the transaction.
1427 */
1428 *tpp = tp;
1429 return 0;
1430
1431 error1:
1432 xfs_bmap_cancel(&free_list);
1433 error0:
1434 /*
1435 * Have to come here with the inode locked and either
1436 * (held and in the transaction) or (not in the transaction).
1437 * If the inode isn't held then cancel would iput it, but
1438 * that's wrong since this is inactive and the vnode ref
1439 * count is 0 already.
1440 * Cancel won't do anything to the inode if held, but it still
1441 * needs to be locked until the cancel is done, if it was
1442 * joined to the transaction.
1443 */
1444 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1445 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1446 *tpp = NULL;
1447 return error;
1448
1449}
1450
1451STATIC int
1452xfs_inactive_symlink_local(
1453 xfs_inode_t *ip,
1454 xfs_trans_t **tpp)
1455{
1456 int error;
1457
1458 ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1459 /*
1460 * We're freeing a symlink which fit into
1461 * the inode. Just free the memory used
1462 * to hold the old symlink.
1463 */
1464 error = xfs_trans_reserve(*tpp, 0,
1465 XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1466 0, XFS_TRANS_PERM_LOG_RES,
1467 XFS_ITRUNCATE_LOG_COUNT);
1468
1469 if (error) {
1470 xfs_trans_cancel(*tpp, 0);
1471 *tpp = NULL;
1472 return (error);
1473 }
1474 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1475
1476 /*
1477 * Zero length symlinks _can_ exist.
1478 */
1479 if (ip->i_df.if_bytes > 0) {
1480 xfs_idata_realloc(ip,
1481 -(ip->i_df.if_bytes),
1482 XFS_DATA_FORK);
1483 ASSERT(ip->i_df.if_bytes == 0);
1484 }
1485 return (0);
1486}
1487
1488/*
1489 * xfs_inactive_attrs
1490 */
1491STATIC int
1492xfs_inactive_attrs(
1493 xfs_inode_t *ip,
1494 xfs_trans_t **tpp)
1495{
1496 xfs_trans_t *tp;
1497 int error;
1498 xfs_mount_t *mp;
1499
1500 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1501 tp = *tpp;
1502 mp = ip->i_mount;
1503 ASSERT(ip->i_d.di_forkoff != 0);
1504 xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1505 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1506
1507 error = xfs_attr_inactive(ip);
1508 if (error) {
1509 *tpp = NULL;
1510 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1511 return (error); /* goto out*/
1512 }
1513
1514 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1515 error = xfs_trans_reserve(tp, 0,
1516 XFS_IFREE_LOG_RES(mp),
1517 0, XFS_TRANS_PERM_LOG_RES,
1518 XFS_INACTIVE_LOG_COUNT);
1519 if (error) {
1520 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1521 xfs_trans_cancel(tp, 0);
1522 *tpp = NULL;
1523 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1524 return (error);
1525 }
1526
1527 xfs_ilock(ip, XFS_ILOCK_EXCL);
1528 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1529 xfs_trans_ihold(tp, ip);
1530 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1531
1532 ASSERT(ip->i_d.di_anextents == 0);
1533
1534 *tpp = tp;
1535 return (0);
1536}
1537
1538STATIC int
1539xfs_release(
1540 bhv_desc_t *bdp)
1541{
1542 xfs_inode_t *ip;
1543 vnode_t *vp;
1544 xfs_mount_t *mp;
1545 int error;
1546
1547 vp = BHV_TO_VNODE(bdp);
1548 ip = XFS_BHVTOI(bdp);
1549
1550 if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
1551 return 0;
1552 }
1553
1554 /* If this is a read-only mount, don't do this (would generate I/O) */
1555 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1556 return 0;
1557
1558#ifdef HAVE_REFCACHE
1559 /* If we are in the NFS reference cache then don't do this now */
1560 if (ip->i_refcache)
1561 return 0;
1562#endif
1563
1564 mp = ip->i_mount;
1565
1566 if (ip->i_d.di_nlink != 0) {
1567 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1568 ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1569 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1570 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) {
1571 if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1572 return (error);
1573 /* Update linux inode block count after free above */
1574 LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1575 ip->i_d.di_nblocks + ip->i_delayed_blks);
1576 }
1577 }
1578
1579 return 0;
1580}
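/*
 * (xfs_release() runs when the last reference to an open file is
 * dropped; the net effect of the checks above is to trim blocks
 * speculatively allocated beyond EOF, except for preallocated or
 * append-only files.)
 */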
1581
1582/*
1583 * xfs_inactive
1584 *
1585 * This is called when the vnode reference count for the vnode
1586 * goes to zero. If the file has been unlinked, then it must
1587 * now be truncated. Also, we clear all of the read-ahead state
1588 * kept for the inode here since the file is now closed.
1589 */
1590STATIC int
1591xfs_inactive(
1592 bhv_desc_t *bdp,
1593 cred_t *credp)
1594{
1595 xfs_inode_t *ip;
1596 vnode_t *vp;
1597 xfs_bmap_free_t free_list;
1598 xfs_fsblock_t first_block;
1599 int committed;
1600 xfs_trans_t *tp;
1601 xfs_mount_t *mp;
1602 int error;
1603 int truncate;
1604
1605 vp = BHV_TO_VNODE(bdp);
1606 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1607
1608 ip = XFS_BHVTOI(bdp);
1609
1610 /*
1611 * If the inode is already free, then there can be nothing
1612 * to clean up here.
1613 */
1614 if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1615 ASSERT(ip->i_df.if_real_bytes == 0);
1616 ASSERT(ip->i_df.if_broot_bytes == 0);
1617 return VN_INACTIVE_CACHE;
1618 }
1619
1620 /*
1621 * Only do a truncate if it's a regular file with
1622 * some actual space in it. It's OK to look at the
1623 * inode's fields without the lock because we're the
1624 * only one with a reference to the inode.
1625 */
1626 truncate = ((ip->i_d.di_nlink == 0) &&
1627 ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
1628 ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1629
1630 mp = ip->i_mount;
1631
1632 if (ip->i_d.di_nlink == 0 &&
1633 DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1634 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1635 }
1636
1637 error = 0;
1638
1639 /* If this is a read-only mount, don't do this (would generate I/O) */
1640 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1641 goto out;
1642
1643 if (ip->i_d.di_nlink != 0) {
1644 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1645 ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1646 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1647 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) ||
1648 (ip->i_delayed_blks != 0))) {
1649 if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1650 return (VN_INACTIVE_CACHE);
1651 /* Update linux inode block count after free above */
1652 LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1653 ip->i_d.di_nblocks + ip->i_delayed_blks);
1654 }
1655 goto out;
1656 }
1657
1658 ASSERT(ip->i_d.di_nlink == 0);
1659
1660 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1661 return (VN_INACTIVE_CACHE);
1662
1663 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1664 if (truncate) {
1665 /*
1666 * Do the xfs_itruncate_start() call before
1667 * reserving any log space because itruncate_start
1668 * will call into the buffer cache and we can't
1669 * do that within a transaction.
1670 */
1671 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1672
1673 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1674
1675 error = xfs_trans_reserve(tp, 0,
1676 XFS_ITRUNCATE_LOG_RES(mp),
1677 0, XFS_TRANS_PERM_LOG_RES,
1678 XFS_ITRUNCATE_LOG_COUNT);
1679 if (error) {
1680 /* Don't call itruncate_cleanup */
1681 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1682 xfs_trans_cancel(tp, 0);
1683 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1684 return (VN_INACTIVE_CACHE);
1685 }
1686
1687 xfs_ilock(ip, XFS_ILOCK_EXCL);
1688 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1689 xfs_trans_ihold(tp, ip);
1690
1691 /*
1692	 * Normally, we have to run xfs_itruncate_finish sync.
1693	 * But if the filesystem is wsync and we're in the inactive
1694	 * path, then we know that nlink == 0, and that the
1695	 * transaction that made nlink == 0 is permanently committed
1696 * since xfs_remove runs as a synchronous transaction.
1697 */
1698 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1699 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1700
1701 if (error) {
1702 xfs_trans_cancel(tp,
1703 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1704 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1705 return (VN_INACTIVE_CACHE);
1706 }
1707 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1708
1709 /*
1710 * If we get an error while cleaning up a
1711 * symlink we bail out.
1712 */
1713 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1714 xfs_inactive_symlink_rmt(ip, &tp) :
1715 xfs_inactive_symlink_local(ip, &tp);
1716
1717 if (error) {
1718 ASSERT(tp == NULL);
1719 return (VN_INACTIVE_CACHE);
1720 }
1721
1722 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1723 xfs_trans_ihold(tp, ip);
1724 } else {
1725 error = xfs_trans_reserve(tp, 0,
1726 XFS_IFREE_LOG_RES(mp),
1727 0, XFS_TRANS_PERM_LOG_RES,
1728 XFS_INACTIVE_LOG_COUNT);
1729 if (error) {
1730 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1731 xfs_trans_cancel(tp, 0);
1732 return (VN_INACTIVE_CACHE);
1733 }
1734
1735 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1736 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1737 xfs_trans_ihold(tp, ip);
1738 }
1739
1740 /*
1741 * If there are attributes associated with the file
1742 * then blow them away now. The code calls a routine
1743 * that recursively deconstructs the attribute fork.
1744 * We need to just commit the current transaction
1745 * because we can't use it for xfs_attr_inactive().
1746 */
1747 if (ip->i_d.di_anextents > 0) {
1748 error = xfs_inactive_attrs(ip, &tp);
1749 /*
1750 * If we got an error, the transaction is already
1751 * cancelled, and the inode is unlocked. Just get out.
1752 */
1753 if (error)
1754 return (VN_INACTIVE_CACHE);
1755 } else if (ip->i_afp) {
1756 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1757 }
1758
1759 /*
1760 * Free the inode.
1761 */
1762 XFS_BMAP_INIT(&free_list, &first_block);
1763 error = xfs_ifree(tp, ip, &free_list);
1764 if (error) {
1765 /*
1766 * If we fail to free the inode, shut down. The cancel
1767	 * might do that; we need to make sure. Otherwise the
1768 * inode might be lost for a long time or forever.
1769 */
1770 if (!XFS_FORCED_SHUTDOWN(mp)) {
1771 cmn_err(CE_NOTE,
1772 "xfs_inactive: xfs_ifree() returned an error = %d on %s",
1773 error, mp->m_fsname);
1774 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1775 }
1776 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1777 } else {
1778 /*
1779 * Credit the quota account(s). The inode is gone.
1780 */
1781 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1782
1783 /*
1784 * Just ignore errors at this point. There is
1785 * nothing we can do except to try to keep going.
1786 */
1787 (void) xfs_bmap_finish(&tp, &free_list, first_block,
1788 &committed);
1789 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1790 }
1791 /*
1792 * Release the dquots held by inode, if any.
1793 */
1794 XFS_QM_DQDETACH(mp, ip);
1795
1796 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1797
1798 out:
1799 return VN_INACTIVE_CACHE;
1800}
1801
1802
1803/*
1804 * xfs_lookup
1805 */
1806STATIC int
1807xfs_lookup(
1808 bhv_desc_t *dir_bdp,
1809 vname_t *dentry,
1810 vnode_t **vpp,
1811 int flags,
1812 vnode_t *rdir,
1813 cred_t *credp)
1814{
1815 xfs_inode_t *dp, *ip;
1816 xfs_ino_t e_inum;
1817 int error;
1818 uint lock_mode;
1819 vnode_t *dir_vp;
1820
1821 dir_vp = BHV_TO_VNODE(dir_bdp);
1822 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1823
1824 dp = XFS_BHVTOI(dir_bdp);
1825
1826 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1827 return XFS_ERROR(EIO);
1828
1829 lock_mode = xfs_ilock_map_shared(dp);
1830 error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1831 if (!error) {
1832 *vpp = XFS_ITOV(ip);
1833 ITRACE(ip);
1834 }
1835 xfs_iunlock_map_shared(dp, lock_mode);
1836 return error;
1837}
1838
1839
1840/*
1841 * xfs_create (create a new file).
1842 */
1843STATIC int
1844xfs_create(
1845 bhv_desc_t *dir_bdp,
1846 vname_t *dentry,
1847 vattr_t *vap,
1848 vnode_t **vpp,
1849 cred_t *credp)
1850{
1851 char *name = VNAME(dentry);
1852 vnode_t *dir_vp;
1853 xfs_inode_t *dp, *ip;
1854 vnode_t *vp=NULL;
1855 xfs_trans_t *tp;
1856 xfs_mount_t *mp;
1857 xfs_dev_t rdev;
1858 int error;
1859 xfs_bmap_free_t free_list;
1860 xfs_fsblock_t first_block;
1861 boolean_t dp_joined_to_trans;
1862 int dm_event_sent = 0;
1863 uint cancel_flags;
1864 int committed;
1865 xfs_prid_t prid;
1866 struct xfs_dquot *udqp, *gdqp;
1867 uint resblks;
1868 int dm_di_mode;
1869 int namelen;
1870
1871 ASSERT(!*vpp);
1872 dir_vp = BHV_TO_VNODE(dir_bdp);
1873 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1874
1875 dp = XFS_BHVTOI(dir_bdp);
1876 mp = dp->i_mount;
1877
1878 dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
1879 namelen = VNAMELEN(dentry);
1880
1881 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1882 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1883 dir_vp, DM_RIGHT_NULL, NULL,
1884 DM_RIGHT_NULL, name, NULL,
1885 dm_di_mode, 0, 0);
1886
1887 if (error)
1888 return error;
1889 dm_event_sent = 1;
1890 }
1891
1892 if (XFS_FORCED_SHUTDOWN(mp))
1893 return XFS_ERROR(EIO);
1894
1895 /* Return through std_return after this point. */
1896
1897 udqp = gdqp = NULL;
1898 if (vap->va_mask & XFS_AT_PROJID)
1899 prid = (xfs_prid_t)vap->va_projid;
1900 else
1901 prid = (xfs_prid_t)dfltprid;
1902
1903 /*
1904 * Make sure that we have allocated dquot(s) on disk.
1905 */
1906 error = XFS_QM_DQVOPALLOC(mp, dp,
1907 current_fsuid(credp), current_fsgid(credp),
1908 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1909 if (error)
1910 goto std_return;
1911
1912 ip = NULL;
1913 dp_joined_to_trans = B_FALSE;
1914
1915 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1916 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1917 resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1918 /*
1919 * Initially assume that the file does not exist and
1920 * reserve the resources for that case. If that is not
1921 * the case we'll drop the one we have and get a more
1922 * appropriate transaction later.
1923 */
1924 error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1925 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1926 if (error == ENOSPC) {
1927 resblks = 0;
1928 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1929 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1930 }
1931 if (error) {
1932 cancel_flags = 0;
1933 dp = NULL;
1934 goto error_return;
1935 }
1936
1937 xfs_ilock(dp, XFS_ILOCK_EXCL);
1938
1939 XFS_BMAP_INIT(&free_list, &first_block);
1940
1941 ASSERT(ip == NULL);
1942
1943 /*
1944 * Reserve disk quota and the inode.
1945 */
1946 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1947 if (error)
1948 goto error_return;
1949
1950 if (resblks == 0 &&
1951 (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
1952 goto error_return;
1953 rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1954 error = xfs_dir_ialloc(&tp, dp,
1955 MAKEIMODE(vap->va_type,vap->va_mode), 1,
1956 rdev, credp, prid, resblks > 0,
1957 &ip, &committed);
1958 if (error) {
1959 if (error == ENOSPC)
1960 goto error_return;
1961 goto abort_return;
1962 }
1963 ITRACE(ip);
1964
1965 /*
1966 * At this point, we've gotten a newly allocated inode.
1967 * It is locked (and joined to the transaction).
1968 */
1969
1970 ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1971
1972 /*
1973 * Now we join the directory inode to the transaction.
1974 * We do not do it earlier because xfs_dir_ialloc
1975 * might commit the previous transaction (and release
1976 * all the locks).
1977 */
1978
1979 VN_HOLD(dir_vp);
1980 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1981 dp_joined_to_trans = B_TRUE;
1982
1983 error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
1984 &first_block, &free_list,
1985 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1986 if (error) {
1987 ASSERT(error != ENOSPC);
1988 goto abort_return;
1989 }
1990 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1991 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1992
1993 /*
1994 * If this is a synchronous mount, make sure that the
1995 * create transaction goes to disk before returning to
1996 * the user.
1997 */
1998 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1999 xfs_trans_set_sync(tp);
2000 }
2001
2002 dp->i_gen++;
2003
2004 /*
2005 * Attach the dquot(s) to the inodes and modify them incore.
2006	 * The new inode's ids couldn't have changed, since it
2007	 * has been locked ever since it was created.
2008 */
2009 XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2010
2011 /*
2012 * xfs_trans_commit normally decrements the vnode ref count
2013 * when it unlocks the inode. Since we want to return the
2014 * vnode to the caller, we bump the vnode ref count now.
2015 */
2016 IHOLD(ip);
2017 vp = XFS_ITOV(ip);
2018
2019 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2020 if (error) {
2021 xfs_bmap_cancel(&free_list);
2022 goto abort_rele;
2023 }
2024
2025 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2026 if (error) {
2027 IRELE(ip);
2028 tp = NULL;
2029 goto error_return;
2030 }
2031
2032 XFS_QM_DQRELE(mp, udqp);
2033 XFS_QM_DQRELE(mp, gdqp);
2034
2035 /*
2036	 * Propagate the fact that the vnode changed after the
2037 * xfs_inode locks have been released.
2038 */
2039 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2040
2041 *vpp = vp;
2042
2043	/* Fall through to std_return with error = 0 */
2044
2045std_return:
2046 if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2047 DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2048 DM_EVENT_POSTCREATE)) {
2049 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2050 dir_vp, DM_RIGHT_NULL,
2051 *vpp ? vp:NULL,
2052 DM_RIGHT_NULL, name, NULL,
2053 dm_di_mode, error, 0);
2054 }
2055 return error;
2056
2057 abort_return:
2058 cancel_flags |= XFS_TRANS_ABORT;
2059 /* FALLTHROUGH */
2060 error_return:
2061
2062 if (tp != NULL)
2063 xfs_trans_cancel(tp, cancel_flags);
2064
2065 if (!dp_joined_to_trans && (dp != NULL))
2066 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2067 XFS_QM_DQRELE(mp, udqp);
2068 XFS_QM_DQRELE(mp, gdqp);
2069
2070 goto std_return;
2071
2072 abort_rele:
2073 /*
2074 * Wait until after the current transaction is aborted to
2075 * release the inode. This prevents recursive transactions
2076 * and deadlocks from xfs_inactive.
2077 */
2078 cancel_flags |= XFS_TRANS_ABORT;
2079 xfs_trans_cancel(tp, cancel_flags);
2080 IRELE(ip);
2081
2082 XFS_QM_DQRELE(mp, udqp);
2083 XFS_QM_DQRELE(mp, gdqp);
2084
2085 goto std_return;
2086}
2087
2088#ifdef DEBUG
2089/*
2090 * Some counters to see if (and how often) we are hitting some deadlock
2091 * prevention code paths.
2092 */
2093
2094int xfs_rm_locks;
2095int xfs_rm_lock_delays;
2096int xfs_rm_attempts;
2097#endif
2098
2099/*
2100 * The following routine will lock the inodes associated with the
2101 * directory and the named entry in the directory. The locks are
2102 * acquired in increasing inode number.
2103 *
2104 * If the entry is "..", then only the directory is locked. The
2105 * vnode ref count will still include that from the .. entry in
2106 * this case.
2107 *
2108 * There is a deadlock we need to worry about. If the locked directory is
2109 * in the AIL, it might be blocking up the log. The next inode we lock
2110 * could already be locked by another thread waiting for log space (e.g.
2111 * a permanent log reservation with a long running transaction (see
2112 * xfs_itruncate_finish)). To solve this, we must check if the directory
2113 * is in the AIL and use lock_nowait. If we can't lock, we need to
2114 * drop the inode lock on the directory and try again. xfs_iunlock will
2115 * potentially push the tail if we were holding up the log.
2116 */
2117STATIC int
2118xfs_lock_dir_and_entry(
2119 xfs_inode_t *dp,
2120 vname_t *dentry,
2121 xfs_inode_t *ip) /* inode of entry 'name' */
2122{
2123 int attempts;
2124 xfs_ino_t e_inum;
2125 xfs_inode_t *ips[2];
2126 xfs_log_item_t *lp;
2127
2128#ifdef DEBUG
2129 xfs_rm_locks++;
2130#endif
2131 attempts = 0;
2132
2133again:
2134 xfs_ilock(dp, XFS_ILOCK_EXCL);
2135
2136 e_inum = ip->i_ino;
2137
2138 ITRACE(ip);
2139
2140 /*
2141 * We want to lock in increasing inum. Since we've already
2142 * acquired the lock on the directory, we may need to release
2143	 * it if the inum of the entry turns out to be less.
2144 */
2145 if (e_inum > dp->i_ino) {
2146 /*
2147 * We are already in the right order, so just
2148 * lock on the inode of the entry.
2149 * We need to use nowait if dp is in the AIL.
2150 */
2151
2152 lp = (xfs_log_item_t *)dp->i_itemp;
2153 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2154 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2155 attempts++;
2156#ifdef DEBUG
2157 xfs_rm_attempts++;
2158#endif
2159
2160 /*
2161 * Unlock dp and try again.
2162 * xfs_iunlock will try to push the tail
2163 * if the inode is in the AIL.
2164 */
2165
2166 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2167
2168 if ((attempts % 5) == 0) {
2169 delay(1); /* Don't just spin the CPU */
2170#ifdef DEBUG
2171 xfs_rm_lock_delays++;
2172#endif
2173 }
2174 goto again;
2175 }
2176 } else {
2177 xfs_ilock(ip, XFS_ILOCK_EXCL);
2178 }
2179 } else if (e_inum < dp->i_ino) {
2180 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2181
2182 ips[0] = ip;
2183 ips[1] = dp;
2184 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2185 }
2186 /* else e_inum == dp->i_ino */
2187	/* This can happen if we're asked to lock /x/..:
2188	 * the entry is "..", which is also the parent directory.
2189	 */
2190
2191 return 0;
2192}
2193
2194#ifdef DEBUG
2195int xfs_locked_n;
2196int xfs_small_retries;
2197int xfs_middle_retries;
2198int xfs_lots_retries;
2199int xfs_lock_delays;
2200#endif
2201
2202/*
2203 * The following routine will lock n inodes in exclusive mode.
2204 * We assume the caller calls us with the inodes in i_ino order.
2205 *
2206 * We need to detect deadlock where an inode that we lock
2207 * is in the AIL and we start waiting for another inode that is locked
2208 * by a thread in a long running transaction (such as truncate). This can
2209 * result in deadlock since the long running trans might need to wait
2210 * for the inode we just locked in order to push the tail and free space
2211 * in the log.
2212 */
2213void
2214xfs_lock_inodes(
2215 xfs_inode_t **ips,
2216 int inodes,
2217 int first_locked,
2218 uint lock_mode)
2219{
2220 int attempts = 0, i, j, try_lock;
2221 xfs_log_item_t *lp;
2222
2223 ASSERT(ips && (inodes >= 2)); /* we need at least two */
2224
2225 if (first_locked) {
2226 try_lock = 1;
2227 i = 1;
2228 } else {
2229 try_lock = 0;
2230 i = 0;
2231 }
2232
2233again:
2234 for (; i < inodes; i++) {
2235 ASSERT(ips[i]);
2236
2237 if (i && (ips[i] == ips[i-1])) /* Already locked */
2238 continue;
2239
2240 /*
2241 * If try_lock is not set yet, make sure all locked inodes
2242 * are not in the AIL.
2243 * If any are, set try_lock to be used later.
2244 */
2245
2246 if (!try_lock) {
2247 for (j = (i - 1); j >= 0 && !try_lock; j--) {
2248 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2249 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2250 try_lock++;
2251 }
2252 }
2253 }
2254
2255 /*
2256 * If any of the previous locks we have locked is in the AIL,
2257 * we must TRY to get the second and subsequent locks. If
2258 * we can't get any, we must release all we have
2259 * and try again.
2260 */
2261
2262 if (try_lock) {
2263			/*
2264			 * try_lock means we have an inode locked that is
2265			 * in the AIL. It can only be set once a previous
2266			 * inode has been locked, so it must be 0 when i is 0.
2267			 */
2268 ASSERT(i != 0);
2269 if (!xfs_ilock_nowait(ips[i], lock_mode)) {
2270 attempts++;
2271
2272 /*
2273 * Unlock all previous guys and try again.
2274 * xfs_iunlock will try to push the tail
2275 * if the inode is in the AIL.
2276 */
2277
2278				for (j = i - 1; j >= 0; j--) {
2279
2280 /*
2281 * Check to see if we've already
2282 * unlocked this one.
2283 * Not the first one going back,
2284 * and the inode ptr is the same.
2285 */
2286 if ((j != (i - 1)) && ips[j] ==
2287 ips[j+1])
2288 continue;
2289
2290 xfs_iunlock(ips[j], lock_mode);
2291 }
2292
2293 if ((attempts % 5) == 0) {
2294 delay(1); /* Don't just spin the CPU */
2295#ifdef DEBUG
2296 xfs_lock_delays++;
2297#endif
2298 }
2299 i = 0;
2300 try_lock = 0;
2301 goto again;
2302 }
2303 } else {
2304 xfs_ilock(ips[i], lock_mode);
2305 }
2306 }
2307
2308#ifdef DEBUG
2309 if (attempts) {
2310 if (attempts < 5) xfs_small_retries++;
2311 else if (attempts < 100) xfs_middle_retries++;
2312 else xfs_lots_retries++;
2313 } else {
2314 xfs_locked_n++;
2315 }
2316#endif
2317}
2318
2319#ifdef DEBUG
2320#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);}
2321int remove_which_error_return = 0;
2322#else /* ! DEBUG */
2323#define REMOVE_DEBUG_TRACE(x)
2324#endif /* ! DEBUG */
2325
2326
2327/*
2328 * xfs_remove
2329 *
2330 */
2331STATIC int
2332xfs_remove(
2333 bhv_desc_t *dir_bdp,
2334 vname_t *dentry,
2335 cred_t *credp)
2336{
2337 vnode_t *dir_vp;
2338 char *name = VNAME(dentry);
2339 xfs_inode_t *dp, *ip;
2340 xfs_trans_t *tp = NULL;
2341 xfs_mount_t *mp;
2342 int error = 0;
2343 xfs_bmap_free_t free_list;
2344 xfs_fsblock_t first_block;
2345 int cancel_flags;
2346 int committed;
2347 int dm_di_mode = 0;
2348 int link_zero;
2349 uint resblks;
2350 int namelen;
2351
2352 dir_vp = BHV_TO_VNODE(dir_bdp);
2353 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2354
2355 dp = XFS_BHVTOI(dir_bdp);
2356 mp = dp->i_mount;
2357
2358 if (XFS_FORCED_SHUTDOWN(mp))
2359 return XFS_ERROR(EIO);
2360
2361 namelen = VNAMELEN(dentry);
2362
2363 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2364 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2365 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2366 name, NULL, 0, 0, 0);
2367 if (error)
2368 return error;
2369 }
2370
2371 /* From this point on, return through std_return */
2372 ip = NULL;
2373
2374 /*
2375 * We need to get a reference to ip before we get our log
2376 * reservation. The reason for this is that we cannot call
2377 * xfs_iget for an inode for which we do not have a reference
2378 * once we've acquired a log reservation. This is because the
2379 * inode we are trying to get might be in xfs_inactive going
2380 * for a log reservation. Since we'll have to wait for the
2381 * inactive code to complete before returning from xfs_iget,
2382 * we need to make sure that we don't have log space reserved
2383	 * when we call xfs_iget. Instead we get an unlocked reference
2384 * to the inode before getting our log reservation.
2385 */
2386 error = xfs_get_dir_entry(dentry, &ip);
2387 if (error) {
2388 REMOVE_DEBUG_TRACE(__LINE__);
2389 goto std_return;
2390 }
2391
2392 dm_di_mode = ip->i_d.di_mode;
2393
2394 vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2395
2396 ITRACE(ip);
2397
2398 error = XFS_QM_DQATTACH(mp, dp, 0);
2399 if (!error && dp != ip)
2400 error = XFS_QM_DQATTACH(mp, ip, 0);
2401 if (error) {
2402 REMOVE_DEBUG_TRACE(__LINE__);
2403 IRELE(ip);
2404 goto std_return;
2405 }
2406
2407 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2408 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2409 /*
2410 * We try to get the real space reservation first,
2411 * allowing for directory btree deletion(s) implying
2412 * possible bmap insert(s). If we can't get the space
2413 * reservation then we use 0 instead, and avoid the bmap
2414 * btree insert(s) in the directory code by, if the bmap
2415 * insert tries to happen, instead trimming the LAST
2416 * block from the directory.
2417 */
2418 resblks = XFS_REMOVE_SPACE_RES(mp);
2419 error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2420 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2421 if (error == ENOSPC) {
2422 resblks = 0;
2423 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2424 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2425 }
2426 if (error) {
2427 ASSERT(error != ENOSPC);
2428 REMOVE_DEBUG_TRACE(__LINE__);
2429 xfs_trans_cancel(tp, 0);
2430 IRELE(ip);
2431 return error;
2432 }
2433
2434 error = xfs_lock_dir_and_entry(dp, dentry, ip);
2435 if (error) {
2436 REMOVE_DEBUG_TRACE(__LINE__);
2437 xfs_trans_cancel(tp, cancel_flags);
2438 IRELE(ip);
2439 goto std_return;
2440 }
2441
2442 /*
2443 * At this point, we've gotten both the directory and the entry
2444 * inodes locked.
2445 */
2446 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2447 if (dp != ip) {
2448 /*
2449 * Increment vnode ref count only in this case since
2450 * there's an extra vnode reference in the case where
2451 * dp == ip.
2452 */
2453 IHOLD(dp);
2454 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2455 }
2456
2457 /*
2458 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2459 */
2460 XFS_BMAP_INIT(&free_list, &first_block);
2461 error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2462 &first_block, &free_list, 0);
2463 if (error) {
2464 ASSERT(error != ENOENT);
2465 REMOVE_DEBUG_TRACE(__LINE__);
2466 goto error1;
2467 }
2468 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2469
2470 dp->i_gen++;
2471 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2472
2473 error = xfs_droplink(tp, ip);
2474 if (error) {
2475 REMOVE_DEBUG_TRACE(__LINE__);
2476 goto error1;
2477 }
2478
2479 /* Determine if this is the last link while
2480 * we are in the transaction.
2481 */
2482	link_zero = (ip->i_d.di_nlink == 0);
2483
2484 /*
2485 * Take an extra ref on the inode so that it doesn't
2486 * go to xfs_inactive() from within the commit.
2487 */
2488 IHOLD(ip);
2489
2490 /*
2491 * If this is a synchronous mount, make sure that the
2492 * remove transaction goes to disk before returning to
2493 * the user.
2494 */
2495 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2496 xfs_trans_set_sync(tp);
2497 }
2498
2499 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2500 if (error) {
2501 REMOVE_DEBUG_TRACE(__LINE__);
2502 goto error_rele;
2503 }
2504
2505 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2506 if (error) {
2507 IRELE(ip);
2508 goto std_return;
2509 }
2510
2511 /*
2512 * Before we drop our extra reference to the inode, purge it
2513 * from the refcache if it is there. By waiting until afterwards
2514 * to do the IRELE, we ensure that we won't go inactive in the
2515 * xfs_refcache_purge_ip routine (although that would be OK).
2516 */
2517 xfs_refcache_purge_ip(ip);
2518
2519 vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2520
2521 /*
2522 * Let interposed file systems know about removed links.
2523 */
2524 VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2525
2526 IRELE(ip);
2527
2528/* Fall through to std_return with error = 0 */
2529 std_return:
2530 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2531 DM_EVENT_POSTREMOVE)) {
2532 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2533 dir_vp, DM_RIGHT_NULL,
2534 NULL, DM_RIGHT_NULL,
2535 name, NULL, dm_di_mode, error, 0);
2536 }
2537 return error;
2538
2539 error1:
2540 xfs_bmap_cancel(&free_list);
2541 cancel_flags |= XFS_TRANS_ABORT;
2542 xfs_trans_cancel(tp, cancel_flags);
2543 goto std_return;
2544
2545 error_rele:
2546 /*
2547 * In this case make sure to not release the inode until after
2548 * the current transaction is aborted. Releasing it beforehand
2549 * can cause us to go to xfs_inactive and start a recursive
2550 * transaction which can easily deadlock with the current one.
2551 */
2552 xfs_bmap_cancel(&free_list);
2553 cancel_flags |= XFS_TRANS_ABORT;
2554 xfs_trans_cancel(tp, cancel_flags);
2555
2556 /*
2557 * Before we drop our extra reference to the inode, purge it
2558 * from the refcache if it is there. By waiting until afterwards
2559 * to do the IRELE, we ensure that we won't go inactive in the
2560 * xfs_refcache_purge_ip routine (although that would be OK).
2561 */
2562 xfs_refcache_purge_ip(ip);
2563
2564 IRELE(ip);
2565
2566 goto std_return;
2567}
2568
2569
2570/*
2571 * xfs_link
2572 *
2573 */
2574STATIC int
2575xfs_link(
2576 bhv_desc_t *target_dir_bdp,
2577 vnode_t *src_vp,
2578 vname_t *dentry,
2579 cred_t *credp)
2580{
2581 xfs_inode_t *tdp, *sip;
2582 xfs_trans_t *tp;
2583 xfs_mount_t *mp;
2584 xfs_inode_t *ips[2];
2585 int error;
2586 xfs_bmap_free_t free_list;
2587 xfs_fsblock_t first_block;
2588 int cancel_flags;
2589 int committed;
2590 vnode_t *target_dir_vp;
2591 bhv_desc_t *src_bdp;
2592 int resblks;
2593 char *target_name = VNAME(dentry);
2594 int target_namelen;
2595
2596 target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2597 vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2598 vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2599
2600 target_namelen = VNAMELEN(dentry);
2601 if (src_vp->v_type == VDIR)
2602 return XFS_ERROR(EPERM);
2603
2604 /*
2605 * For now, manually find the XFS behavior descriptor for
2606 * the source vnode. If it doesn't exist then something
2607 * is wrong and we should just return an error.
2608 * Eventually we need to figure out how link is going to
2609 * work in the face of stacked vnodes.
2610 */
2611 src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
2612 if (src_bdp == NULL) {
2613 return XFS_ERROR(EXDEV);
2614 }
2615 sip = XFS_BHVTOI(src_bdp);
2616 tdp = XFS_BHVTOI(target_dir_bdp);
2617 mp = tdp->i_mount;
2618 if (XFS_FORCED_SHUTDOWN(mp))
2619 return XFS_ERROR(EIO);
2620
2621 if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2622 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2623 target_dir_vp, DM_RIGHT_NULL,
2624 src_vp, DM_RIGHT_NULL,
2625 target_name, NULL, 0, 0, 0);
2626 if (error)
2627 return error;
2628 }
2629
2630 /* Return through std_return after this point. */
2631
2632 error = XFS_QM_DQATTACH(mp, sip, 0);
2633 if (!error && sip != tdp)
2634 error = XFS_QM_DQATTACH(mp, tdp, 0);
2635 if (error)
2636 goto std_return;
2637
2638 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2639 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2640 resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2641 error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2642 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2643 if (error == ENOSPC) {
2644 resblks = 0;
2645 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2646 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2647 }
2648 if (error) {
2649 cancel_flags = 0;
2650 goto error_return;
2651 }
2652
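	/*
	 * Lock the two inodes in increasing inode number order, as
	 * xfs_lock_inodes expects its callers to hand it inodes in
	 * i_ino order.
	 */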
2653 if (sip->i_ino < tdp->i_ino) {
2654 ips[0] = sip;
2655 ips[1] = tdp;
2656 } else {
2657 ips[0] = tdp;
2658 ips[1] = sip;
2659 }
2660
2661 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2662
2663 /*
2664 * Increment vnode ref counts since xfs_trans_commit &
2665 * xfs_trans_cancel will both unlock the inodes and
2666 * decrement the associated ref counts.
2667 */
2668 VN_HOLD(src_vp);
2669 VN_HOLD(target_dir_vp);
2670 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2671 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2672
2673 /*
2674 * If the source has too many links, we can't make any more to it.
2675 */
2676 if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2677 error = XFS_ERROR(EMLINK);
2678 goto error_return;
2679 }
2680
2681 if (resblks == 0 &&
2682 (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
2683 target_namelen)))
2684 goto error_return;
2685
2686 XFS_BMAP_INIT(&free_list, &first_block);
2687
2688 error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
2689 sip->i_ino, &first_block, &free_list,
2690 resblks);
2691 if (error)
2692 goto abort_return;
2693 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2694 tdp->i_gen++;
2695 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2696
2697 error = xfs_bumplink(tp, sip);
2698 if (error) {
2699 goto abort_return;
2700 }
2701
2702 /*
2703 * If this is a synchronous mount, make sure that the
2704 * link transaction goes to disk before returning to
2705 * the user.
2706 */
2707 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2708 xfs_trans_set_sync(tp);
2709 }
2710
2711 error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
2712 if (error) {
2713 xfs_bmap_cancel(&free_list);
2714 goto abort_return;
2715 }
2716
2717 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2718 if (error) {
2719 goto std_return;
2720 }
2721
2722 /* Fall through to std_return with error = 0. */
2723std_return:
2724 if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2725 DM_EVENT_POSTLINK)) {
2726 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2727 target_dir_vp, DM_RIGHT_NULL,
2728 src_vp, DM_RIGHT_NULL,
2729 target_name, NULL, 0, error, 0);
2730 }
2731 return error;
2732
2733 abort_return:
2734 cancel_flags |= XFS_TRANS_ABORT;
2735 /* FALLTHROUGH */
2736 error_return:
2737 xfs_trans_cancel(tp, cancel_flags);
2738
2739 goto std_return;
2740}
2741/*
2742 * xfs_mkdir
2743 *
2744 */
2745STATIC int
2746xfs_mkdir(
2747 bhv_desc_t *dir_bdp,
2748 vname_t *dentry,
2749 vattr_t *vap,
2750 vnode_t **vpp,
2751 cred_t *credp)
2752{
2753 char *dir_name = VNAME(dentry);
2754 xfs_inode_t *dp;
2755 xfs_inode_t *cdp; /* inode of created dir */
2756 vnode_t *cvp; /* vnode of created dir */
2757 xfs_trans_t *tp;
2758 xfs_mount_t *mp;
2759 int cancel_flags;
2760 int error;
2761 int committed;
2762 xfs_bmap_free_t free_list;
2763 xfs_fsblock_t first_block;
2764 vnode_t *dir_vp;
2765 boolean_t dp_joined_to_trans;
2766 boolean_t created = B_FALSE;
2767 int dm_event_sent = 0;
2768 xfs_prid_t prid;
2769 struct xfs_dquot *udqp, *gdqp;
2770 uint resblks;
2771 int dm_di_mode;
2772 int dir_namelen;
2773
2774 dir_vp = BHV_TO_VNODE(dir_bdp);
2775 dp = XFS_BHVTOI(dir_bdp);
2776 mp = dp->i_mount;
2777
2778 if (XFS_FORCED_SHUTDOWN(mp))
2779 return XFS_ERROR(EIO);
2780
2781 dir_namelen = VNAMELEN(dentry);
2782
2783 tp = NULL;
2784 dp_joined_to_trans = B_FALSE;
2785 dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
2786
2787 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2788 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2789 dir_vp, DM_RIGHT_NULL, NULL,
2790 DM_RIGHT_NULL, dir_name, NULL,
2791 dm_di_mode, 0, 0);
2792 if (error)
2793 return error;
2794 dm_event_sent = 1;
2795 }
2796
2797 /* Return through std_return after this point. */
2798
2799 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2800
2801 mp = dp->i_mount;
2802 udqp = gdqp = NULL;
2803 if (vap->va_mask & XFS_AT_PROJID)
2804 prid = (xfs_prid_t)vap->va_projid;
2805 else
2806 prid = (xfs_prid_t)dfltprid;
2807
2808 /*
2809 * Make sure that we have allocated dquot(s) on disk.
2810 */
2811 error = XFS_QM_DQVOPALLOC(mp, dp,
2812 current_fsuid(credp), current_fsgid(credp),
2813 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2814 if (error)
2815 goto std_return;
2816
2817 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2818 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2819 resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2820 error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2821 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2822 if (error == ENOSPC) {
2823 resblks = 0;
2824 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2825 XFS_TRANS_PERM_LOG_RES,
2826 XFS_MKDIR_LOG_COUNT);
2827 }
2828 if (error) {
2829 cancel_flags = 0;
2830 dp = NULL;
2831 goto error_return;
2832 }
2833
2834 xfs_ilock(dp, XFS_ILOCK_EXCL);
2835
2836 /*
2837 * Check for directory link count overflow.
2838 */
2839 if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2840 error = XFS_ERROR(EMLINK);
2841 goto error_return;
2842 }
2843
2844 /*
2845 * Reserve disk quota and the inode.
2846 */
2847 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2848 if (error)
2849 goto error_return;
2850
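	/*
	 * Check for ability to enter directory entry, if no space reserved.
	 */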
2851 if (resblks == 0 &&
2852 (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
2853 goto error_return;
2854 /*
2855 * create the directory inode.
2856 */
2857 error = xfs_dir_ialloc(&tp, dp,
2858 MAKEIMODE(vap->va_type,vap->va_mode), 2,
2859 0, credp, prid, resblks > 0,
2860 &cdp, NULL);
2861 if (error) {
2862 if (error == ENOSPC)
2863 goto error_return;
2864 goto abort_return;
2865 }
2866 ITRACE(cdp);
2867
2868 /*
2869 * Now we add the directory inode to the transaction.
2870 * We waited until now since xfs_dir_ialloc might start
2871 * a new transaction. Had we joined the transaction
2872 * earlier, the locks might have gotten released.
2873 */
2874 VN_HOLD(dir_vp);
2875 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2876 dp_joined_to_trans = B_TRUE;
2877
2878 XFS_BMAP_INIT(&free_list, &first_block);
2879
2880 error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
2881 cdp->i_ino, &first_block, &free_list,
2882 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2883 if (error) {
2884 ASSERT(error != ENOSPC);
2885 goto error1;
2886 }
2887 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2888
2889 /*
2890 * Bump the in memory version number of the parent directory
2891 * so that other processes accessing it will recognize that
2892 * the directory has changed.
2893 */
2894 dp->i_gen++;
2895
2896 error = XFS_DIR_INIT(mp, tp, cdp, dp);
2897 if (error) {
2898 goto error2;
2899 }
2900
2901 cdp->i_gen = 1;
2902 error = xfs_bumplink(tp, dp);
2903 if (error) {
2904 goto error2;
2905 }
2906
2907 cvp = XFS_ITOV(cdp);
2908
2909 created = B_TRUE;
2910
2911 *vpp = cvp;
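	/*
	 * xfs_trans_commit normally decrements the vnode ref count
	 * when it unlocks the inode. Since we want to return the
	 * vnode to the caller, we bump the vnode ref count now.
	 */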
2912 IHOLD(cdp);
2913
2914 /*
2915 * Attach the dquots to the new inode and modify the icount incore.
2916 */
2917 XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2918
2919 /*
2920 * If this is a synchronous mount, make sure that the
2921 * mkdir transaction goes to disk before returning to
2922 * the user.
2923 */
2924 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2925 xfs_trans_set_sync(tp);
2926 }
2927
2928 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2929 if (error) {
2930 IRELE(cdp);
2931 goto error2;
2932 }
2933
2934 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2935 XFS_QM_DQRELE(mp, udqp);
2936 XFS_QM_DQRELE(mp, gdqp);
2937 if (error) {
2938 IRELE(cdp);
2939 }
2940
2941 /* Fall through to std_return with error = 0 or errno from
2942 * xfs_trans_commit. */
2943
2944std_return:
2945 if ( (created || (error != 0 && dm_event_sent != 0)) &&
2946 DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2947 DM_EVENT_POSTCREATE)) {
2948 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2949 dir_vp, DM_RIGHT_NULL,
2950 created ? XFS_ITOV(cdp):NULL,
2951 DM_RIGHT_NULL,
2952 dir_name, NULL,
2953 dm_di_mode, error, 0);
2954 }
2955 return error;
2956
2957 error2:
2958 error1:
2959 xfs_bmap_cancel(&free_list);
2960 abort_return:
2961 cancel_flags |= XFS_TRANS_ABORT;
2962 error_return:
2963 xfs_trans_cancel(tp, cancel_flags);
2964 XFS_QM_DQRELE(mp, udqp);
2965 XFS_QM_DQRELE(mp, gdqp);
2966
2967 if (!dp_joined_to_trans && (dp != NULL)) {
2968 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2969 }
2970
2971 goto std_return;
2972}
2973
2974
2975/*
2976 * xfs_rmdir
2977 *
2978 */
2979STATIC int
2980xfs_rmdir(
2981 bhv_desc_t *dir_bdp,
2982 vname_t *dentry,
2983 cred_t *credp)
2984{
2985 char *name = VNAME(dentry);
2986 xfs_inode_t *dp;
2987 xfs_inode_t *cdp; /* child directory */
2988 xfs_trans_t *tp;
2989 xfs_mount_t *mp;
2990 int error;
2991 xfs_bmap_free_t free_list;
2992 xfs_fsblock_t first_block;
2993 int cancel_flags;
2994 int committed;
2995 vnode_t *dir_vp;
2996 int dm_di_mode = 0;
2997 int last_cdp_link;
2998 int namelen;
2999 uint resblks;
3000
3001 dir_vp = BHV_TO_VNODE(dir_bdp);
3002 dp = XFS_BHVTOI(dir_bdp);
3003 mp = dp->i_mount;
3004
3005 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3006
3007 if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3008 return XFS_ERROR(EIO);
3009 namelen = VNAMELEN(dentry);
3010
3011 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3012 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3013 dir_vp, DM_RIGHT_NULL,
3014 NULL, DM_RIGHT_NULL,
3015 name, NULL, 0, 0, 0);
3016 if (error)
3017 return XFS_ERROR(error);
3018 }
3019
3020 /* Return through std_return after this point. */
3021
3022 cdp = NULL;
3023
3024 /*
3025 * We need to get a reference to cdp before we get our log
3026 * reservation. The reason for this is that we cannot call
3027 * xfs_iget for an inode for which we do not have a reference
3028 * once we've acquired a log reservation. This is because the
3029 * inode we are trying to get might be in xfs_inactive going
3030 * for a log reservation. Since we'll have to wait for the
3031 * inactive code to complete before returning from xfs_iget,
3032 * we need to make sure that we don't have log space reserved
3033	 * when we call xfs_iget. Instead we get an unlocked reference
3034 * to the inode before getting our log reservation.
3035 */
3036 error = xfs_get_dir_entry(dentry, &cdp);
3037 if (error) {
3038 REMOVE_DEBUG_TRACE(__LINE__);
3039 goto std_return;
3040 }
3041 mp = dp->i_mount;
3042 dm_di_mode = cdp->i_d.di_mode;
3043
3044 /*
3045 * Get the dquots for the inodes.
3046 */
3047 error = XFS_QM_DQATTACH(mp, dp, 0);
3048 if (!error && dp != cdp)
3049 error = XFS_QM_DQATTACH(mp, cdp, 0);
3050 if (error) {
3051 IRELE(cdp);
3052 REMOVE_DEBUG_TRACE(__LINE__);
3053 goto std_return;
3054 }
3055
3056 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3057 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3058 /*
3059 * We try to get the real space reservation first,
3060 * allowing for directory btree deletion(s) implying
3061 * possible bmap insert(s). If we can't get the space
3062 * reservation then we use 0 instead, and avoid the bmap
3063 * btree insert(s) in the directory code by, if the bmap
3064 * insert tries to happen, instead trimming the LAST
3065 * block from the directory.
3066 */
3067 resblks = XFS_REMOVE_SPACE_RES(mp);
3068 error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3069 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3070 if (error == ENOSPC) {
3071 resblks = 0;
3072 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3073 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3074 }
3075 if (error) {
3076 ASSERT(error != ENOSPC);
3077 cancel_flags = 0;
3078 IRELE(cdp);
3079 goto error_return;
3080 }
3081 XFS_BMAP_INIT(&free_list, &first_block);
3082
3083 /*
3084 * Now lock the child directory inode and the parent directory
3085 * inode in the proper order. This will take care of validating
3086 * that the directory entry for the child directory inode has
3087 * not changed while we were obtaining a log reservation.
3088 */
3089 error = xfs_lock_dir_and_entry(dp, dentry, cdp);
3090 if (error) {
3091 xfs_trans_cancel(tp, cancel_flags);
3092 IRELE(cdp);
3093 goto std_return;
3094 }
3095
3096 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3097 if (dp != cdp) {
3098 /*
3099 * Only increment the parent directory vnode count if
3100 * we didn't bump it in looking up cdp. The only time
3101 * we don't bump it is when we're looking up ".".
3102 */
3103 VN_HOLD(dir_vp);
3104 }
3105
3106 ITRACE(cdp);
3107 xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3108
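	/*
	 * An empty directory holds only "." and "..", giving it a
	 * link count of exactly 2; a higher count, or any remaining
	 * entries, means it cannot be removed.
	 */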
3109 ASSERT(cdp->i_d.di_nlink >= 2);
3110 if (cdp->i_d.di_nlink != 2) {
3111 error = XFS_ERROR(ENOTEMPTY);
3112 goto error_return;
3113 }
3114 if (!XFS_DIR_ISEMPTY(mp, cdp)) {
3115 error = XFS_ERROR(ENOTEMPTY);
3116 goto error_return;
3117 }
3118
3119 error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
3120 &first_block, &free_list, resblks);
3121 if (error) {
3122 goto error1;
3123 }
3124
3125 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3126
3127 /*
3128 * Bump the in memory generation count on the parent
3129	 * directory so that others can know that it has changed.
3130 */
3131 dp->i_gen++;
3132
3133 /*
3134 * Drop the link from cdp's "..".
3135 */
3136 error = xfs_droplink(tp, dp);
3137 if (error) {
3138 goto error1;
3139 }
3140
3141 /*
3142 * Drop the link from dp to cdp.
3143 */
3144 error = xfs_droplink(tp, cdp);
3145 if (error) {
3146 goto error1;
3147 }
3148
3149 /*
3150 * Drop the "." link from cdp to self.
3151 */
3152 error = xfs_droplink(tp, cdp);
3153 if (error) {
3154 goto error1;
3155 }
3156
3157 /* Determine these before committing transaction */
3158	last_cdp_link = (cdp->i_d.di_nlink == 0);
3159
3160 /*
3161 * Take an extra ref on the child vnode so that it
3162 * does not go to xfs_inactive() from within the commit.
3163 */
3164 IHOLD(cdp);
3165
3166 /*
3167 * If this is a synchronous mount, make sure that the
3168 * rmdir transaction goes to disk before returning to
3169 * the user.
3170 */
3171 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3172 xfs_trans_set_sync(tp);
3173 }
3174
3175 error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
3176 if (error) {
3177 xfs_bmap_cancel(&free_list);
3178 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3179 XFS_TRANS_ABORT));
3180 IRELE(cdp);
3181 goto std_return;
3182 }
3183
3184 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3185 if (error) {
3186 IRELE(cdp);
3187 goto std_return;
3188 }
3189
3190
3191 /*
3192 * Let interposed file systems know about removed links.
3193 */
3194 VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3195
3196 IRELE(cdp);
3197
3198 /* Fall through to std_return with error = 0 or the errno
3199 * from xfs_trans_commit. */
3200std_return:
3201 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3202 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3203 dir_vp, DM_RIGHT_NULL,
3204 NULL, DM_RIGHT_NULL,
3205 name, NULL, dm_di_mode,
3206 error, 0);
3207 }
3208 return error;
3209
3210 error1:
3211 xfs_bmap_cancel(&free_list);
3212 cancel_flags |= XFS_TRANS_ABORT;
3213 error_return:
3214 xfs_trans_cancel(tp, cancel_flags);
3215 goto std_return;
3216}
3217
3218
3219/*
3220 * xfs_readdir
3221 *
3222 * Read dp's entries starting at uiop->uio_offset and translate them into
3223 * bufsize bytes worth of struct dirents starting at bufbase.
3224 */
3225STATIC int
3226xfs_readdir(
3227 bhv_desc_t *dir_bdp,
3228 uio_t *uiop,
3229 cred_t *credp,
3230 int *eofp)
3231{
3232 xfs_inode_t *dp;
3233 xfs_trans_t *tp = NULL;
3234 int error = 0;
3235 uint lock_mode;
3236 xfs_off_t start_offset;
3237
3238 vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3239 (inst_t *)__return_address);
3240 dp = XFS_BHVTOI(dir_bdp);
3241
3242 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3243 return XFS_ERROR(EIO);
3244 }
3245
3246 lock_mode = xfs_ilock_map_shared(dp);
3247 start_offset = uiop->uio_offset;
3248 error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
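	/*
	 * If getdents advanced the offset we handed back at least
	 * one entry, so update the directory's access time.
	 */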
3249 if (start_offset != uiop->uio_offset) {
3250 xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
3251 }
3252 xfs_iunlock_map_shared(dp, lock_mode);
3253 return error;
3254}
3255
3256
3257/*
3258 * xfs_symlink
3259 *
3260 */
3261STATIC int
3262xfs_symlink(
3263 bhv_desc_t *dir_bdp,
3264 vname_t *dentry,
3265 vattr_t *vap,
3266 char *target_path,
3267 vnode_t **vpp,
3268 cred_t *credp)
3269{
3270 xfs_trans_t *tp;
3271 xfs_mount_t *mp;
3272 xfs_inode_t *dp;
3273 xfs_inode_t *ip;
3274 int error;
3275 int pathlen;
3276 xfs_bmap_free_t free_list;
3277 xfs_fsblock_t first_block;
3278 boolean_t dp_joined_to_trans;
3279 vnode_t *dir_vp;
3280 uint cancel_flags;
3281 int committed;
3282 xfs_fileoff_t first_fsb;
3283 xfs_filblks_t fs_blocks;
3284 int nmaps;
3285 xfs_bmbt_irec_t mval[SYMLINK_MAPS];
3286 xfs_daddr_t d;
3287 char *cur_chunk;
3288 int byte_cnt;
3289 int n;
3290 xfs_buf_t *bp;
3291 xfs_prid_t prid;
3292 struct xfs_dquot *udqp, *gdqp;
3293 uint resblks;
3294 char *link_name = VNAME(dentry);
3295 int link_namelen;
3296
3297 *vpp = NULL;
3298 dir_vp = BHV_TO_VNODE(dir_bdp);
3299 dp = XFS_BHVTOI(dir_bdp);
3300 dp_joined_to_trans = B_FALSE;
3301 error = 0;
3302 ip = NULL;
3303 tp = NULL;
3304
3305 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3306
3307 mp = dp->i_mount;
3308
3309 if (XFS_FORCED_SHUTDOWN(mp))
3310 return XFS_ERROR(EIO);
3311
3312 link_namelen = VNAMELEN(dentry);
3313
3314 /*
3315 * Check component lengths of the target path name.
3316 */
3317 pathlen = strlen(target_path);
3318 if (pathlen >= MAXPATHLEN) /* total string too long */
3319 return XFS_ERROR(ENAMETOOLONG);
3320 if (pathlen >= MAXNAMELEN) { /* is any component too long? */
3321 int len, total;
3322 char *path;
3323
3324		for (total = 0, path = target_path; total < pathlen;) {
3325			/*
3326			 * Skip any slashes.
3327			 */
3328			while (*path == '/') {
3329 total++;
3330 path++;
3331 }
3332
3333 /*
3334 * Count up to the next slash or end of path.
3335 * Error out if the component is bigger than MAXNAMELEN.
3336 */
3337			for (len = 0; *path != '/' && total < pathlen; total++, path++) {
3338 if (++len >= MAXNAMELEN) {
3339 error = ENAMETOOLONG;
3340 return error;
3341 }
3342 }
3343 }
3344 }
3345
3346 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3347 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3348 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3349 link_name, target_path, 0, 0, 0);
3350 if (error)
3351 return error;
3352 }
3353
3354 /* Return through std_return after this point. */
3355
3356 udqp = gdqp = NULL;
3357 if (vap->va_mask & XFS_AT_PROJID)
3358 prid = (xfs_prid_t)vap->va_projid;
3359 else
3360 prid = (xfs_prid_t)dfltprid;
3361
3362 /*
3363 * Make sure that we have allocated dquot(s) on disk.
3364 */
3365 error = XFS_QM_DQVOPALLOC(mp, dp,
3366 current_fsuid(credp), current_fsgid(credp),
3367 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3368 if (error)
3369 goto std_return;
3370
3371 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3372 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3373 /*
3374	 * Will the symlink fit into the inode data fork?
3375	 * There can't be any attributes yet, so we get the whole variable part.
3376 */
3377 if (pathlen <= XFS_LITINO(mp))
3378 fs_blocks = 0;
3379 else
3380 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3381 resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3382 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3383 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3384 if (error == ENOSPC && fs_blocks == 0) {
3385 resblks = 0;
3386 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3387 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3388 }
3389 if (error) {
3390 cancel_flags = 0;
3391 dp = NULL;
3392 goto error_return;
3393 }
3394
3395 xfs_ilock(dp, XFS_ILOCK_EXCL);
3396
3397 /*
3398 * Check whether the directory allows new symlinks or not.
3399 */
3400 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3401 error = XFS_ERROR(EPERM);
3402 goto error_return;
3403 }
3404
3405 /*
3406 * Reserve disk quota : blocks and inode.
3407 */
3408 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3409 if (error)
3410 goto error_return;
3411
3412 /*
3413 * Check for ability to enter directory entry, if no space reserved.
3414 */
3415 if (resblks == 0 &&
3416 (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3417 goto error_return;
3418 /*
3419 * Initialize the bmap freelist prior to calling either
3420 * bmapi or the directory create code.
3421 */
3422 XFS_BMAP_INIT(&free_list, &first_block);
3423
3424 /*
3425 * Allocate an inode for the symlink.
3426 */
3427 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3428 1, 0, credp, prid, resblks > 0, &ip, NULL);
3429 if (error) {
3430 if (error == ENOSPC)
3431 goto error_return;
3432 goto error1;
3433 }
3434 ITRACE(ip);
3435
3436 VN_HOLD(dir_vp);
3437 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3438 dp_joined_to_trans = B_TRUE;
3439
3440 /*
3441 * Also attach the dquot(s) to it, if applicable.
3442 */
3443 XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3444
3445 if (resblks)
3446 resblks -= XFS_IALLOC_SPACE_RES(mp);
3447 /*
3448 * If the symlink will fit into the inode, write it inline.
3449 */
3450 if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3451 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3452 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3453 ip->i_d.di_size = pathlen;
3454
3455 /*
3456 * The inode was initially created in extent format.
3457 */
3458 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3459 ip->i_df.if_flags |= XFS_IFINLINE;
3460
3461 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3462 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3463
3464 } else {
3465 first_fsb = 0;
3466 nmaps = SYMLINK_MAPS;
3467
3468 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3469 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3470 &first_block, resblks, mval, &nmaps,
3471 &free_list);
3472 if (error) {
3473 goto error1;
3474 }
3475
3476 if (resblks)
3477 resblks -= fs_blocks;
3478 ip->i_d.di_size = pathlen;
3479 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3480
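		/*
		 * Copy the target path into the newly allocated
		 * blocks, one mapping at a time, logging each
		 * buffer as it is filled.
		 */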
3481 cur_chunk = target_path;
3482 for (n = 0; n < nmaps; n++) {
3483 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3484 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3485 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3486 BTOBB(byte_cnt), 0);
3487 ASSERT(bp && !XFS_BUF_GETERROR(bp));
3488 if (pathlen < byte_cnt) {
3489 byte_cnt = pathlen;
3490 }
3491 pathlen -= byte_cnt;
3492
3493 memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3494 cur_chunk += byte_cnt;
3495
3496 xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3497 }
3498 }
3499
3500 /*
3501 * Create the directory entry for the symlink.
3502 */
3503 error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3504 ip->i_ino, &first_block, &free_list, resblks);
3505 if (error) {
3506 goto error1;
3507 }
3508 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3509 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3510
3511 /*
3512 * Bump the in memory version number of the parent directory
3513 * so that other processes accessing it will recognize that
3514 * the directory has changed.
3515 */
3516 dp->i_gen++;
3517
3518 /*
3519 * If this is a synchronous mount, make sure that the
3520 * symlink transaction goes to disk before returning to
3521 * the user.
3522 */
3523 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3524 xfs_trans_set_sync(tp);
3525 }
3526
3527 /*
3528 * xfs_trans_commit normally decrements the vnode ref count
3529 * when it unlocks the inode. Since we want to return the
3530 * vnode to the caller, we bump the vnode ref count now.
3531 */
3532 IHOLD(ip);
3533
3534 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3535 if (error) {
3536 goto error2;
3537 }
3538 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3539 XFS_QM_DQRELE(mp, udqp);
3540 XFS_QM_DQRELE(mp, gdqp);
3541
3542 /* Fall through to std_return with error = 0 or errno from
3543 * xfs_trans_commit */
3544std_return:
3545 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3546 DM_EVENT_POSTSYMLINK)) {
3547 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3548 dir_vp, DM_RIGHT_NULL,
3549 error ? NULL : XFS_ITOV(ip),
3550 DM_RIGHT_NULL, link_name, target_path,
3551 0, error, 0);
3552 }
3553
3554 if (!error) {
3555 vnode_t *vp;
3556
3557 ASSERT(ip);
3558 vp = XFS_ITOV(ip);
3559 *vpp = vp;
3560 }
3561 return error;
3562
3563 error2:
3564 IRELE(ip);
3565 error1:
3566 xfs_bmap_cancel(&free_list);
3567 cancel_flags |= XFS_TRANS_ABORT;
3568 error_return:
3569 xfs_trans_cancel(tp, cancel_flags);
3570 XFS_QM_DQRELE(mp, udqp);
3571 XFS_QM_DQRELE(mp, gdqp);
3572
3573 if (!dp_joined_to_trans && (dp != NULL)) {
3574 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3575 }
3576
3577 goto std_return;
3578}
3579
3580
3581/*
3582 * xfs_fid2
3583 *
3584 * A fid routine that takes a pointer to a previously allocated
3585 * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3586 */
3587STATIC int
3588xfs_fid2(
3589 bhv_desc_t *bdp,
3590 fid_t *fidp)
3591{
3592 xfs_inode_t *ip;
3593 xfs_fid2_t *xfid;
3594
3595 vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3596 (inst_t *)__return_address);
3597 ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3598
3599 xfid = (xfs_fid2_t *)fidp;
3600 ip = XFS_BHVTOI(bdp);
3601 xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3602 xfid->fid_pad = 0;
3603 /*
3604	 * use memcpy because the inode number is a long long and there's no
3605 * assurance that xfid->fid_ino is properly aligned.
3606 */
3607 memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3608 xfid->fid_gen = ip->i_d.di_gen;
3609
3610 return 0;
3611}
3612
3613
3614/*
3615 * xfs_rwlock
3616 */
3617int
3618xfs_rwlock(
3619 bhv_desc_t *bdp,
3620 vrwlock_t locktype)
3621{
3622 xfs_inode_t *ip;
3623 vnode_t *vp;
3624
3625 vp = BHV_TO_VNODE(bdp);
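	/*
	 * There is nothing to do for directories; the iolock taken
	 * here is only used to serialize regular file I/O.
	 */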
3626 if (vp->v_type == VDIR)
3627 return 1;
3628 ip = XFS_BHVTOI(bdp);
3629 if (locktype == VRWLOCK_WRITE) {
3630 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3631 } else if (locktype == VRWLOCK_TRY_READ) {
3632 return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
3633 } else if (locktype == VRWLOCK_TRY_WRITE) {
3634 return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
3635 } else {
3636 ASSERT((locktype == VRWLOCK_READ) ||
3637 (locktype == VRWLOCK_WRITE_DIRECT));
3638 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3639 }
3640
3641 return 1;
3642}
3643
3644
3645/*
3646 * xfs_rwunlock
3647 */
3648void
3649xfs_rwunlock(
3650 bhv_desc_t *bdp,
3651 vrwlock_t locktype)
3652{
3653 xfs_inode_t *ip;
3654 vnode_t *vp;
3655
3656 vp = BHV_TO_VNODE(bdp);
3657 if (vp->v_type == VDIR)
3658 return;
3659 ip = XFS_BHVTOI(bdp);
3660 if (locktype == VRWLOCK_WRITE) {
3661 /*
3662 * In the write case, we may have added a new entry to
3663		 * the reference cache. That entry may have stored, in
3664		 * this inode, a pointer to another inode that needs
3665		 * releasing. If it is there, clear the pointer and
3666		 * release that inode after unlocking this one.
3667 */
3668 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3669 } else {
3670 ASSERT((locktype == VRWLOCK_READ) ||
3671 (locktype == VRWLOCK_WRITE_DIRECT));
3672 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3673 }
3674 return;
3675}
3676
3677STATIC int
3678xfs_inode_flush(
3679 bhv_desc_t *bdp,
3680 int flags)
3681{
3682 xfs_inode_t *ip;
3683 xfs_mount_t *mp;
3684 xfs_inode_log_item_t *iip;
3685 int error = 0;
3686
3687 ip = XFS_BHVTOI(bdp);
3688 mp = ip->i_mount;
3689 iip = ip->i_itemp;
3690
3691 if (XFS_FORCED_SHUTDOWN(mp))
3692 return XFS_ERROR(EIO);
3693
3694 /*
3695 * Bypass inodes which have already been cleaned by
3696 * the inode flush clustering code inside xfs_iflush
3697 */
3698 if ((ip->i_update_core == 0) &&
3699 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3700 return 0;
3701
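	/*
	 * FLUSH_LOG: if the inode's last logged change is already
	 * on disk (at or before the last log sync LSN) there is
	 * nothing to do; otherwise force the log up to that LSN,
	 * synchronously if FLUSH_SYNC is also set.
	 */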
3702 if (flags & FLUSH_LOG) {
3703 if (iip && iip->ili_last_lsn) {
3704 xlog_t *log = mp->m_log;
3705 xfs_lsn_t sync_lsn;
3706 int s, log_flags = XFS_LOG_FORCE;
3707
3708 s = GRANT_LOCK(log);
3709 sync_lsn = log->l_last_sync_lsn;
3710 GRANT_UNLOCK(log, s);
3711
3712 if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3713 return 0;
3714
3715 if (flags & FLUSH_SYNC)
3716 log_flags |= XFS_LOG_SYNC;
3717 return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3718 }
3719 }
3720
3721 /*
3722	 * We make this non-blocking if the inode is contended, and
3723	 * return EAGAIN to indicate to the caller that we did not
3724	 * succeed. This prevents the flush path from blocking on
3725	 * inodes that are inside another operation right now;
3726	 * they get caught later by xfs_sync.
3727 */
3728 if (flags & FLUSH_INODE) {
3729 int flush_flags;
3730
3731 if (xfs_ipincount(ip))
3732 return EAGAIN;
3733
3734 if (flags & FLUSH_SYNC) {
3735 xfs_ilock(ip, XFS_ILOCK_SHARED);
3736 xfs_iflock(ip);
3737 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3738 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3739 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3740 return EAGAIN;
3741 }
3742 } else {
3743 return EAGAIN;
3744 }
3745
3746 if (flags & FLUSH_SYNC)
3747 flush_flags = XFS_IFLUSH_SYNC;
3748 else
3749 flush_flags = XFS_IFLUSH_ASYNC;
3750
3751 error = xfs_iflush(ip, flush_flags);
3752 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3753 }
3754
3755 return error;
3756}
3757
3758
3759int
3760xfs_set_dmattrs (
3761 bhv_desc_t *bdp,
3762 u_int evmask,
3763 u_int16_t state,
3764 cred_t *credp)
3765{
3766 xfs_inode_t *ip;
3767 xfs_trans_t *tp;
3768 xfs_mount_t *mp;
3769 int error;
3770
3771 if (!capable(CAP_SYS_ADMIN))
3772 return XFS_ERROR(EPERM);
3773
3774 ip = XFS_BHVTOI(bdp);
3775 mp = ip->i_mount;
3776
3777 if (XFS_FORCED_SHUTDOWN(mp))
3778 return XFS_ERROR(EIO);
3779
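	/*
	 * A simple transaction: log just the inode core with the
	 * new DMAPI event mask and state.
	 */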
3780 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3781 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3782 if (error) {
3783 xfs_trans_cancel(tp, 0);
3784 return error;
3785 }
3786 xfs_ilock(ip, XFS_ILOCK_EXCL);
3787 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3788
3789 ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3790 ip->i_iocore.io_dmstate = ip->i_d.di_dmstate = state;
3791
3792 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3793 IHOLD(ip);
3794 error = xfs_trans_commit(tp, 0, NULL);
3795
3796 return error;
3797}
3798
3799
3800/*
3801 * xfs_reclaim
3802 */
3803STATIC int
3804xfs_reclaim(
3805 bhv_desc_t *bdp)
3806{
3807 xfs_inode_t *ip;
3808 vnode_t *vp;
3809
3810 vp = BHV_TO_VNODE(bdp);
3811 ip = XFS_BHVTOI(bdp);
3812
3813 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3814
3815 ASSERT(!VN_MAPPED(vp));
3816
3817 /* bad inode, get out here ASAP */
3818 if (VN_BAD(vp)) {
3819 xfs_ireclaim(ip);
3820 return 0;
3821 }
3822
3823 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3824 if (ip->i_d.di_size > 0) {
3825 /*
3826 * Flush and invalidate any data left around that is
3827 * a part of this file.
3828 *
3829 * Get the inode's i/o lock so that buffers are pushed
3830 * out while holding the proper lock. We can't hold
3831 * the inode lock here since flushing out buffers may
3832 * cause us to try to get the lock in xfs_strategy().
3833 *
3834 * We don't have to call remapf() here, because there
3835 * cannot be any mapped file references to this vnode
3836 * since it is being reclaimed.
3837 */
3838 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3839
3840 /*
3841 * If we hit an IO error, we need to make sure that the
3842 * buffer and page caches of file data for
3843 * the file are tossed away. We don't want to use
3844 * VOP_FLUSHINVAL_PAGES here because we don't want dirty
3845 * pages to stay attached to the vnode, but be
3846 * marked P_BAD. pdflush/vnode_pagebad
3847 * hates that.
3848 */
3849 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3850 VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
3851 } else {
3852 VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3853 }
3854
3855 ASSERT(VN_CACHED(vp) == 0);
3856 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
3857 ip->i_delayed_blks == 0);
3858 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3859 } else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3860 /*
3861 * di_size field may not be quite accurate if we're
3862 * shutting down.
3863 */
3864 VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3865 ASSERT(VN_CACHED(vp) == 0);
3866 }
3867 }
3868
3869 /* If we have nothing to flush with this inode then complete the
3870 * teardown now, otherwise break the link between the xfs inode
3871 * and the linux inode and clean up the xfs inode later. This
3872 * avoids flushing the inode to disk during the delete operation
3873 * itself.
3874 */
3875 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3876 xfs_ilock(ip, XFS_ILOCK_EXCL);
3877 xfs_iflock(ip);
3878 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3879 } else {
3880 xfs_mount_t *mp = ip->i_mount;
3881
3882 /* Protect sync from us */
3883 XFS_MOUNT_ILOCK(mp);
3884 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3885 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3886 ip->i_flags |= XFS_IRECLAIMABLE;
3887 XFS_MOUNT_IUNLOCK(mp);
3888 }
3889 return 0;
3890}
3891
3892int
3893xfs_finish_reclaim(
3894 xfs_inode_t *ip,
3895 int locked,
3896 int sync_mode)
3897{
3898 xfs_ihash_t *ih = ip->i_hash;
3899 vnode_t *vp = XFS_ITOV_NULL(ip);
3900 int error;
3901
3902 if (vp && VN_BAD(vp))
3903 goto reclaim;
3904
3905 /* The hash lock here protects a thread in xfs_iget_core from
3906 * racing with us on linking the inode back with a vnode.
3907 * Once we have the XFS_IRECLAIM flag set it will not touch
3908 * us.
3909 */
3910 write_lock(&ih->ih_lock);
3911 if ((ip->i_flags & XFS_IRECLAIM) ||
3912 (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
3913 write_unlock(&ih->ih_lock);
3914 if (locked) {
3915 xfs_ifunlock(ip);
3916 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3917 }
3918		return 1;
3919 }
3920 ip->i_flags |= XFS_IRECLAIM;
3921 write_unlock(&ih->ih_lock);
3922
3923 /*
3924 * If the inode is still dirty, then flush it out. If the inode
3925 * is not in the AIL, then it will be OK to flush it delwri as
3926 * long as xfs_iflush() does not keep any references to the inode.
3927 * We leave that decision up to xfs_iflush() since it has the
3928 * knowledge of whether it's OK to simply do a delwri flush of
3929 * the inode or whether we need to wait until the inode is
3930 * pulled from the AIL.
3931 * We get the flush lock regardless, though, just to make sure
3932 * we don't free it while it is being flushed.
3933 */
3934 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3935 if (!locked) {
3936 xfs_ilock(ip, XFS_ILOCK_EXCL);
3937 xfs_iflock(ip);
3938 }
3939
3940 if (ip->i_update_core ||
3941 ((ip->i_itemp != NULL) &&
3942 (ip->i_itemp->ili_format.ilf_fields != 0))) {
3943 error = xfs_iflush(ip, sync_mode);
3944 /*
3945 * If we hit an error, typically because of filesystem
3946			 * shutdown, we don't need to let vn_reclaim know
3947			 * because we're going to reclaim the inode anyway.
3948 */
3949 if (error) {
3950 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3951 goto reclaim;
3952 }
3953 xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3954 }
3955
3956 ASSERT(ip->i_update_core == 0);
3957 ASSERT(ip->i_itemp == NULL ||
3958 ip->i_itemp->ili_format.ilf_fields == 0);
3959 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3960 } else if (locked) {
3961 /*
3962 * We are not interested in doing an iflush if we're
3963 * in the process of shutting down the filesystem forcibly.
3964 * So, just reclaim the inode.
3965 */
3966 xfs_ifunlock(ip);
3967 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3968 }
3969
3970 reclaim:
3971 xfs_ireclaim(ip);
3972 return 0;
3973}
3974
3975int
3976xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3977{
3978 int purged;
3979 xfs_inode_t *ip, *n;
3980 int done = 0;
3981
3982 while (!done) {
3983 purged = 0;
3984 XFS_MOUNT_ILOCK(mp);
3985 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3986 if (noblock) {
3987 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3988 continue;
3989 if (xfs_ipincount(ip) ||
3990 !xfs_iflock_nowait(ip)) {
3991 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3992 continue;
3993 }
3994 }
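			/*
			 * Drop the mount lock while reclaiming this
			 * inode; the list can change underneath us,
			 * so restart the scan from the top.
			 */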
3995 XFS_MOUNT_IUNLOCK(mp);
3996 xfs_finish_reclaim(ip, noblock,
3997 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
3998 purged = 1;
3999 break;
4000 }
4001
4002 done = !purged;
4003 }
4004
4005 XFS_MOUNT_IUNLOCK(mp);
4006 return 0;
4007}
4008
4009/*
4010 * xfs_alloc_file_space()
4011 * This routine allocates disk space for the given file.
4012 *
4013 * If alloc_type == 0, this request is for an ALLOCSP type
4014 * request which will change the file size. In this case, no
4015 * DMAPI event will be generated by the call. A TRUNCATE event
4016 * will be generated later by xfs_setattr.
4017 *
4018 * If alloc_type != 0, this request is for a RESVSP type
4019 * request, and a DMAPI DM_EVENT_WRITE will be generated if the
4020 * lower block boundary byte address is less than the file's
4021 * length.
4022 *
4023 * RETURNS:
4024 * 0 on success
4025 * errno on error
4026 *
4027 */
int
xfs_alloc_file_space(
	xfs_inode_t		*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			alloc_type,
	int			attr_flags)
{
	xfs_filblks_t		allocated_fsb;
	xfs_filblks_t		allocatesize_fsb;
	int			committed;
	xfs_off_t		count;
	xfs_filblks_t		datablocks;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_bmbt_irec_t		*imapp;
	xfs_bmbt_irec_t		imaps[1];
	xfs_mount_t		*mp;
	int			numrtextents;
	int			reccount;
	uint			resblks;
	int			rt;
	int			rtextsize;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;
	int			xfs_bmapi_flags;

	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * determine if this is a realtime file
	 */
	if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) {
		if (ip->i_d.di_extsize)
			rtextsize = ip->i_d.di_extsize;
		else
			rtextsize = mp->m_sb.sb_rextsize;
	} else
		rtextsize = 0;

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return error;

	if (len <= 0)
		return XFS_ERROR(EINVAL);

	count = len;
	error = 0;
	imapp = &imaps[0];
	reccount = 1;
	xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
	allocatesize_fsb = XFS_B_TO_FSB(mp, count);

	/* Generate a DMAPI event if needed. */
	if (alloc_type != 0 && offset < ip->i_d.di_size &&
	    (attr_flags & ATTR_DMI) == 0 &&
	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
		xfs_off_t	end_dmi_offset;

		end_dmi_offset = offset + len;
		if (end_dmi_offset > ip->i_d.di_size)
			end_dmi_offset = ip->i_d.di_size;
		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
				offset, end_dmi_offset - offset,
				0, NULL);
		if (error)
			return error;
	}

	/*
	 * allocate file space until done or until there is an error
	 */
retry:
	while (allocatesize_fsb && !error) {
		/*
		 * Determine whether we are reserving space on the data
		 * or realtime device.  Realtime allocations must cover
		 * whole realtime extents, so the range is rounded out
		 * to rtextsize boundaries first.
		 */
		if (rt) {
			xfs_fileoff_t	s, e;

			s = startoffset_fsb;
			do_div(s, rtextsize);
			s *= rtextsize;
			e = roundup_64(startoffset_fsb + allocatesize_fsb,
				rtextsize);
			numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
			datablocks = 0;
		} else {
			datablocks = allocatesize_fsb;
			numrtextents = 0;
		}

		/*
		 * allocate and set up the transaction
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
		error = xfs_trans_reserve(tp,
					  resblks,
					  XFS_WRITE_LOG_RES(mp),
					  numrtextents,
					  XFS_TRANS_PERM_LOG_RES,
					  XFS_WRITE_LOG_COUNT);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp,
				ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * issue the bmapi() call to allocate the blocks
		 */
		XFS_BMAP_INIT(&free_list, &firstfsb);
		error = xfs_bmapi(tp, ip, startoffset_fsb,
				  allocatesize_fsb, xfs_bmapi_flags,
				  &firstfsb, 0, imapp, &reccount,
				  &free_list);
		if (error) {
			goto error0;
		}

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
		if (error) {
			goto error0;
		}

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error) {
			break;
		}

		allocated_fsb = imapp->br_blockcount;

		if (reccount == 0) {
			error = XFS_ERROR(ENOSPC);
			break;
		}

		startoffset_fsb += allocated_fsb;
		allocatesize_fsb -= allocated_fsb;
	}
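/*
 * On ENOSPC, let a DMAPI application try to make space: if the
 * DM_EVENT_NOSPACE event is delivered successfully, the allocation is
 * retried from where it stopped.
 */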
dmapi_enospc_check:
	if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {

		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
				XFS_ITOV(ip), DM_RIGHT_NULL,
				XFS_ITOV(ip), DM_RIGHT_NULL,
				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
		if (error == 0)
			goto retry;	/* Maybe DMAPI app. has made space */
		/* else fall through with error from XFS_SEND_NAMESP */
	}

	return error;

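/*
 * error0 also tears down the partially built free list; error1 is
 * reached before any bmap work has started, so it only cancels the
 * transaction and drops the inode lock.
 */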
 error0:
	xfs_bmap_cancel(&free_list);
 error1:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	goto dmapi_enospc_check;
}

/*
 * Zero file bytes between startoff and endoff inclusive.
 * The iolock is held exclusive and no blocks are buffered.
 */
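/*
 * The loop below is a block-at-a-time read-modify-write: each mapped,
 * written block overlapping the range is read into bp, the bytes of it
 * that fall inside [startoff, endoff] are cleared, and the buffer is
 * written back synchronously.  Holes and unwritten extents are skipped
 * since they already read back as zeroes.
 */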
STATIC int
xfs_zero_remaining_bytes(
	xfs_inode_t		*ip,
	xfs_off_t		startoff,
	xfs_off_t		endoff)
{
	xfs_bmbt_irec_t		imap;
	xfs_fileoff_t		offset_fsb;
	xfs_off_t		lastoffset;
	xfs_off_t		offset;
	xfs_buf_t		*bp;
	xfs_mount_t		*mp = ip->i_mount;
	int			nimap;
	int			error = 0;

	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
				ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
				mp->m_rtdev_targp : mp->m_ddev_targp);

	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
		offset_fsb = XFS_B_TO_FSBT(mp, offset);
		nimap = 1;
		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
			&nimap, NULL);
		if (error || nimap < 1)
			break;
		ASSERT(imap.br_blockcount >= 1);
		ASSERT(imap.br_startoff == offset_fsb);
		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
		if (lastoffset > endoff)
			lastoffset = endoff;
		if (imap.br_startblock == HOLESTARTBLOCK)
			continue;
		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
		if (imap.br_state == XFS_EXT_UNWRITTEN)
			continue;
		XFS_BUF_UNDONE(bp);
		XFS_BUF_UNWRITE(bp);
		XFS_BUF_READ(bp);
		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
		xfsbdstrat(mp, bp);
		if ((error = xfs_iowait(bp))) {
			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
					  mp, bp, XFS_BUF_ADDR(bp));
			break;
		}
		memset(XFS_BUF_PTR(bp) +
			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
			0, lastoffset - offset + 1);
		XFS_BUF_UNDONE(bp);
		XFS_BUF_UNREAD(bp);
		XFS_BUF_WRITE(bp);
		xfsbdstrat(mp, bp);
		if ((error = xfs_iowait(bp))) {
			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
					  mp, bp, XFS_BUF_ADDR(bp));
			break;
		}
	}
	xfs_buf_free(bp);
	return error;
}

/*
 * xfs_free_file_space()
 *	This routine frees disk space for the given file.
 *
 *	This routine is only called by xfs_change_file_space
 *	for an UNRESVSP type call.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 */
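/*
 * The byte range is first invalidated from the page cache and the
 * partial blocks at each end are zeroed on disk; the whole blocks in
 * between are then unmapped one transaction at a time by the
 * xfs_bunmapi() loop below.
 */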
STATIC int
xfs_free_file_space(
	xfs_inode_t		*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			attr_flags)
{
	int			committed;
	int			done;
	xfs_off_t		end_dmi_offset;
	xfs_fileoff_t		endoffset_fsb;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_off_t		ilen;
	xfs_bmbt_irec_t		imap;
	xfs_off_t		ioffset;
	xfs_extlen_t		mod = 0;
	xfs_mount_t		*mp;
	int			nimap;
	uint			resblks;
	int			rounding;
	int			rt;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;
	int			need_iolock = (attr_flags & ATTR_DMI) == 0;

	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
	mp = ip->i_mount;

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return error;

	error = 0;
	if (len <= 0)	/* if nothing being freed */
		return error;
	rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
	end_dmi_offset = offset + len;
	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);

	if (offset < ip->i_d.di_size &&
	    (attr_flags & ATTR_DMI) == 0 &&
	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
		if (end_dmi_offset > ip->i_d.di_size)
			end_dmi_offset = ip->i_d.di_size;
		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
				offset, end_dmi_offset - offset,
				AT_DELAY_FLAG(attr_flags), NULL);
		if (error)
			return error;
	}

	if (need_iolock)
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
	rounding = MAX(1 << mp->m_sb.sb_blocklog, NBPP);
	ilen = len + (offset & (rounding - 1));
	ioffset = offset & ~(rounding - 1);
	if (ilen & (rounding - 1))
		ilen = (ilen + rounding) & ~(rounding - 1);
	xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0);
	/*
	 * Need to zero the bytes we're not freeing, on disk.  If it's a
	 * realtime file and we can't use unwritten extents, then we
	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
	 * will take care of it for us.
	 */
	if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
		nimap = 1;
		error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
			&imap, &nimap, NULL);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			xfs_daddr_t	block;

			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			block = imap.br_startblock;
			mod = do_div(block, mp->m_sb.sb_rextsize);
			if (mod)
				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
		}
		nimap = 1;
		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
			&imap, &nimap, NULL);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			mod++;
			if (mod && (mod != mp->m_sb.sb_rextsize))
				endoffset_fsb -= mod;
		}
	}
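	/*
	 * If no whole blocks fall inside the range, there is nothing
	 * for xfs_bunmapi() to unmap: zero the affected bytes directly
	 * and mark the work done so the loop below is skipped.
	 */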
	if ((done = (endoffset_fsb <= startoffset_fsb)))
		/*
		 * One contiguous piece to clear
		 */
		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
	else {
		/*
		 * Some full blocks, possibly two pieces to clear
		 */
		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
			error = xfs_zero_remaining_bytes(ip, offset,
				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
		if (!error &&
		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
			error = xfs_zero_remaining_bytes(ip,
				XFS_FSB_TO_B(mp, endoffset_fsb),
				offset + len - 1);
	}

	/*
	 * free file space until done or until there is an error
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	while (!error && !done) {

		/*
		 * allocate and set up the transaction
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		error = xfs_trans_reserve(tp,
					  resblks,
					  XFS_WRITE_LOG_RES(mp),
					  0,
					  XFS_TRANS_PERM_LOG_RES,
					  XFS_WRITE_LOG_COUNT);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
				ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * issue the bunmapi() call to free the blocks
		 */
		XFS_BMAP_INIT(&free_list, &firstfsb);
		error = xfs_bunmapi(tp, ip, startoffset_fsb,
				    endoffset_fsb - startoffset_fsb,
				    0, 2, &firstfsb, &free_list, &done);
		if (error) {
			goto error0;
		}

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
		if (error) {
			goto error0;
		}

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

 out_unlock_iolock:
	if (need_iolock)
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;

 error0:
	xfs_bmap_cancel(&free_list);
 error1:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
		    XFS_ILOCK_EXCL);
	return error;
}

/*
 * xfs_change_file_space()
 *	This routine allocates or frees disk space for the given file.
 *	The user-specified parameters are checked for alignment and size
 *	limitations.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 */
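/*
 * Illustrative userspace entry point (a sketch under the usual xfsctl
 * conventions, not code from this file; error handling omitted): the
 * space ioctls fill in an xfs_flock64_t and reach this routine through
 * xfs_ioctl(), roughly as in
 *
 *	xfs_flock64_t bf = { 0 };
 *	bf.l_whence = 0;		l_start is then an absolute offset
 *	bf.l_start = 0;
 *	bf.l_len = 16 << 20;		reserve 16MB of space
 *	ioctl(fd, XFS_IOC_RESVSP64, &bf);
 */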
int
xfs_change_file_space(
	bhv_desc_t	*bdp,
	int		cmd,
	xfs_flock64_t	*bf,
	xfs_off_t	offset,
	cred_t		*credp,
	int		attr_flags)
{
	int		clrprealloc;
	int		error;
	xfs_fsize_t	fsize;
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	int		setprealloc;
	xfs_off_t	startoffset;
	xfs_off_t	llen;
	xfs_trans_t	*tp;
	vattr_t		va;
	vnode_t		*vp;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	/*
	 * must be a regular file and have write permission
	 */
	if (vp->v_type != VREG)
		return XFS_ERROR(EINVAL);

	xfs_ilock(ip, XFS_ILOCK_SHARED);

	if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return error;
	}

	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	switch (bf->l_whence) {
	case 0: /* SEEK_SET */
		break;
	case 1: /* SEEK_CUR */
		bf->l_start += offset;
		break;
	case 2: /* SEEK_END */
		bf->l_start += ip->i_d.di_size;
		break;
	default:
		return XFS_ERROR(EINVAL);
	}

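	/*
	 * l_len is a byte count; for a positive count, make llen the
	 * offset of the last byte of the range so the inclusive range
	 * checks below work on [l_start, l_start + llen].
	 */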
	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;

	if ((bf->l_start < 0) ||
	    (bf->l_start > XFS_MAXIOFFSET(mp)) ||
	    (bf->l_start + llen < 0) ||
	    (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
		return XFS_ERROR(EINVAL);

	bf->l_whence = 0;

	startoffset = bf->l_start;
	fsize = ip->i_d.di_size;

	/*
	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
	 * file space.  These calls do NOT zero the data space allocated
	 * to the file, nor do they change the file size.
	 *
	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
	 * space.  These calls cause the new file data to be zeroed and
	 * the file size to be changed.
	 */
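	/*
	 * For example (illustrative numbers only): on a 1MB file, a
	 * RESVSP of 4MB from offset 0 reserves blocks for the whole
	 * range but leaves the size at 1MB, while an ALLOCSP to 4MB
	 * grows the file to 4MB and zeroes the new bytes through the
	 * xfs_setattr() path below.
	 */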
	setprealloc = clrprealloc = 0;

	switch (cmd) {
	case XFS_IOC_RESVSP:
	case XFS_IOC_RESVSP64:
		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
								1, attr_flags);
		if (error)
			return error;
		setprealloc = 1;
		break;

	case XFS_IOC_UNRESVSP:
	case XFS_IOC_UNRESVSP64:
		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
								attr_flags)))
			return error;
		break;

	case XFS_IOC_ALLOCSP:
	case XFS_IOC_ALLOCSP64:
	case XFS_IOC_FREESP:
	case XFS_IOC_FREESP64:
		if (startoffset > fsize) {
			error = xfs_alloc_file_space(ip, fsize,
					startoffset - fsize, 0, attr_flags);
			if (error)
				break;
		}

		va.va_mask = XFS_AT_SIZE;
		va.va_size = startoffset;

		error = xfs_setattr(bdp, &va, attr_flags, credp);

		if (error)
			return error;

		clrprealloc = 1;
		break;

	default:
		ASSERT(0);
		return XFS_ERROR(EINVAL);
	}

	/*
	 * update the inode timestamp, mode, and prealloc flag bits
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);

	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
				       0, 0, 0))) {
		/* ASSERT(0); */
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);

	if ((attr_flags & ATTR_DMI) == 0) {
		ip->i_d.di_mode &= ~S_ISUID;

		/*
		 * Note that we don't have to worry about mandatory
		 * file locking being disabled here because we only
		 * clear the S_ISGID bit if the group execute bit is
		 * on, and mandatory locking is only enabled when that
		 * bit is off.
		 */
		if (ip->i_d.di_mode & S_IXGRP)
			ip->i_d.di_mode &= ~S_ISGID;

		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}
	if (setprealloc)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	else if (clrprealloc)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, 0, NULL);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	return error;
}

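/*
 * Vnode operations vector for XFS, registered on the vnode behavior
 * chain at the VNODE_POSITION_XFS position by BHV_IDENTITY_INIT().
 */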
vnodeops_t xfs_vnodeops = {
	BHV_IDENTITY_INIT(VN_BHV_XFS, VNODE_POSITION_XFS),
	.vop_open		= xfs_open,
	.vop_read		= xfs_read,
#ifdef HAVE_SENDFILE
	.vop_sendfile		= xfs_sendfile,
#endif
	.vop_write		= xfs_write,
	.vop_ioctl		= xfs_ioctl,
	.vop_getattr		= xfs_getattr,
	.vop_setattr		= xfs_setattr,
	.vop_access		= xfs_access,
	.vop_lookup		= xfs_lookup,
	.vop_create		= xfs_create,
	.vop_remove		= xfs_remove,
	.vop_link		= xfs_link,
	.vop_rename		= xfs_rename,
	.vop_mkdir		= xfs_mkdir,
	.vop_rmdir		= xfs_rmdir,
	.vop_readdir		= xfs_readdir,
	.vop_symlink		= xfs_symlink,
	.vop_readlink		= xfs_readlink,
	.vop_fsync		= xfs_fsync,
	.vop_inactive		= xfs_inactive,
	.vop_fid2		= xfs_fid2,
	.vop_rwlock		= xfs_rwlock,
	.vop_rwunlock		= xfs_rwunlock,
	.vop_bmap		= xfs_bmap,
	.vop_reclaim		= xfs_reclaim,
	.vop_attr_get		= xfs_attr_get,
	.vop_attr_set		= xfs_attr_set,
	.vop_attr_remove	= xfs_attr_remove,
	.vop_attr_list		= xfs_attr_list,
	.vop_link_removed	= (vop_link_removed_t)fs_noval,
	.vop_vnode_change	= (vop_vnode_change_t)fs_noval,
	.vop_tosspages		= fs_tosspages,
	.vop_flushinval_pages	= fs_flushinval_pages,
	.vop_flush_pages	= fs_flush_pages,
	.vop_release		= xfs_release,
	.vop_iflush		= xfs_inode_flush,
};