-rw-r--r--  fs/xfs/kmem.c | 79
-rw-r--r--  fs/xfs/kmem.h | 15
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 79
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 130
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 85
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 678
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_iext_tree.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 50
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 12
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 59
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 8
-rw-r--r--  fs/xfs/scrub/agheader.c | 4
-rw-r--r--  fs/xfs/scrub/attr.c | 6
-rw-r--r--  fs/xfs/scrub/bmap.c | 81
-rw-r--r--  fs/xfs/scrub/fscounters.c | 2
-rw-r--r--  fs/xfs/scrub/repair.c | 6
-rw-r--r--  fs/xfs/scrub/symlink.c | 2
-rw-r--r--  fs/xfs/xfs_acl.c | 14
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 2
-rw-r--r--  fs/xfs/xfs_attr_list.c | 2
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 8
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 22
-rw-r--r--  fs/xfs/xfs_buf.c | 7
-rw-r--r--  fs/xfs/xfs_buf.h | 6
-rw-r--r--  fs/xfs/xfs_buf_item.c | 4
-rw-r--r--  fs/xfs/xfs_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 2
-rw-r--r--  fs/xfs/xfs_error.c | 2
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 2
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 8
-rw-r--r--  fs/xfs/xfs_file.c | 26
-rw-r--r--  fs/xfs/xfs_fsmap.c | 12
-rw-r--r--  fs/xfs/xfs_icache.c | 2
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 85
-rw-r--r--  fs/xfs/xfs_inode_item.c | 2
-rw-r--r--  fs/xfs/xfs_ioctl.c | 25
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 6
-rw-r--r--  fs/xfs/xfs_itable.c | 10
-rw-r--r--  fs/xfs/xfs_itable.h | 13
-rw-r--r--  fs/xfs/xfs_iwalk.c | 4
-rw-r--r--  fs/xfs/xfs_iwalk.h | 13
-rw-r--r--  fs/xfs/xfs_log.c | 466
-rw-r--r--  fs/xfs/xfs_log_cil.c | 10
-rw-r--r--  fs/xfs/xfs_log_recover.c | 50
-rw-r--r--  fs/xfs/xfs_mount.c | 4
-rw-r--r--  fs/xfs/xfs_mount.h | 7
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 4
-rw-r--r--  fs/xfs/xfs_qm.c | 4
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 16
-rw-r--r--  fs/xfs/xfs_reflink.c | 23
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 6
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 4
-rw-r--r--  fs/xfs/xfs_super.c | 3
-rw-r--r--  fs/xfs/xfs_trace.h | 34
-rw-r--r--  fs/xfs/xfs_trans.c | 4
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 2
-rw-r--r--  fs/xfs/xfs_xattr.c | 2
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  mm/fadvise.c | 4
-rw-r--r--  mm/madvise.c | 22
81 files changed, 1315 insertions, 1089 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 16bb9a328678..da031b93e182 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,10 +3,10 @@
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/sched/mm.h>
+#include "xfs.h"
 #include <linux/backing-dev.h>
-#include "kmem.h"
 #include "xfs_message.h"
+#include "xfs_trace.h"
 
 void *
 kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_alloc(size, flags, _RET_IP_);
+
 	do {
 		ptr = kmalloc(size, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 	} while (1);
 }
 
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+
+/*
+ * __vmalloc() will allocate data pages and auxiliary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
+ * we need to tell memory reclaim that we are in such a context via
+ * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
+ * and potentially deadlocking.
+ */
+static void *
+__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
 {
 	unsigned nofs_flag = 0;
 	void	*ptr;
-	gfp_t	lflags;
-
-	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
-	if (ptr)
-		return ptr;
+	gfp_t	lflags = kmem_flags_convert(flags);
 
-	/*
-	 * __vmalloc() will allocate data pages and auxiliary structures (e.g.
-	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
-	 * here. Hence we need to tell memory reclaim that we are in such a
-	 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
-	 * the filesystem here and potentially deadlocking.
-	 */
 	if (flags & KM_NOFS)
 		nofs_flag = memalloc_nofs_save();
 
-	lflags = kmem_flags_convert(flags);
 	ptr = __vmalloc(size, lflags, PAGE_KERNEL);
 
 	if (flags & KM_NOFS)
@@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags)
 	return ptr;
 }
 
+/*
+ * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
+ * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
+ * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
+ * aligned region.
+ */
+void *
+kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_io(size, flags, _RET_IP_);
+
+	if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
+		align_mask = PAGE_SIZE - 1;
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr) {
+		if (!((uintptr_t)ptr & align_mask))
+			return ptr;
+		kfree(ptr);
+	}
+	return __kmem_vmalloc(size, flags);
+}
+
+void *
+kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_large(size, flags, _RET_IP_);
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+	return __kmem_vmalloc(size, flags);
+}
+
 void *
 kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 {
@@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_realloc(newsize, flags, _RET_IP_);
+
 	do {
 		ptr = krealloc(old, newsize, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
 	do {
 		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
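
The alignment contract of the new kmem_alloc_io() is simple to check in isolation: an alignment of N bytes is passed as align_mask = N - 1, and an address conforms when all mask bits are clear. A standalone user-space sketch of that test (illustrative only, not part of the patch; malloc stands in for kmalloc):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* true when p satisfies the alignment encoded by align_mask */
	static int is_io_aligned(const void *p, int align_mask)
	{
		return ((uintptr_t)p & align_mask) == 0;
	}

	int main(void)
	{
		void *p = malloc(4096);

		/* 512-byte sector alignment, as a block driver might demand */
		printf("aligned: %d\n", is_io_aligned(p, 511));
		free(p);
		return 0;
	}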
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 267655acd426..8170d95cf930 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -16,8 +16,6 @@
  */
 
 typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_SLEEP	((__force xfs_km_flags_t)0x0001u)
-#define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
 #define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
 #define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
 #define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+	BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
 
-	if (flags & KM_NOSLEEP) {
-		lflags = GFP_ATOMIC | __GFP_NOWARN;
-	} else {
-		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if (flags & KM_NOFS)
-			lflags &= ~__GFP_FS;
-	}
+	lflags = GFP_KERNEL | __GFP_NOWARN;
+	if (flags & KM_NOFS)
+		lflags &= ~__GFP_FS;
 
 	/*
 	 * Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 }
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
 extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void kmem_free(const void *ptr)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 372ad55631fc..533b04aaf6f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2205,7 +2205,7 @@ xfs_defer_agfl_block(
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 	ASSERT(oinfo != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
 	new->xefi_blockcount = 1;
 	new->xefi_oinfo = *oinfo;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2c07c2..58fa85cec325 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -81,10 +81,9 @@ typedef struct xfs_alloc_arg {
 /*
  * Defines for datatype
  */
-#define XFS_ALLOC_USERDATA		(1 << 0)/* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO		(1 << 2)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY		(1 << 3)/* Busy extents not allowed */
+#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 0)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO	(1 << 1)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY		(1 << 2)/* Busy extents not allowed */
 
 static inline bool
 xfs_alloc_is_userdata(int datatype)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index d48fcf11cc35..510ca6974604 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -97,7 +97,10 @@ xfs_inode_hasattr(
  * Overall external interface routines.
  *========================================================================*/
 
-/* Retrieve an extended attribute and its value.  Must have ilock. */
+/*
+ * Retrieve an extended attribute and its value.  Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
 int
 xfs_attr_get_ilocked(
 	struct xfs_inode	*ip,
@@ -115,12 +118,28 @@ xfs_attr_get_ilocked(
 		return xfs_attr_node_get(args);
 }
 
-/* Retrieve an extended attribute by name, and its value. */
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
+ * just an indication whether the attribute exists and the size of the value if
+ * it exists. The size is returned in @valuelenp.
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * @valuelenp, return -ERANGE with the size of the attribute that was found in
+ * @valuelenp.
+ *
+ * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
+ * existence of the attribute has been determined. On success, return that
+ * buffer to the caller and leave them to free it. On failure, free any
+ * allocated buffer and ensure the buffer pointer returned to the caller is
+ * null.
+ */
 int
 xfs_attr_get(
 	struct xfs_inode	*ip,
 	const unsigned char	*name,
-	unsigned char		*value,
+	unsigned char		**value,
 	int			*valuelenp,
 	int			flags)
 {
@@ -128,6 +147,8 @@ xfs_attr_get(
 	uint			lock_mode;
 	int			error;
 
+	ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+
 	XFS_STATS_INC(ip->i_mount, xs_attr_get);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -137,17 +158,29 @@ xfs_attr_get(
 	if (error)
 		return error;
 
-	args.value = value;
-	args.valuelen = *valuelenp;
 	/* Entirely possible to look up a name which doesn't exist */
 	args.op_flags = XFS_DA_OP_OKNOENT;
+	if (flags & ATTR_ALLOC)
+		args.op_flags |= XFS_DA_OP_ALLOCVAL;
+	else
+		args.value = *value;
+	args.valuelen = *valuelenp;
 
 	lock_mode = xfs_ilock_attr_map_shared(ip);
 	error = xfs_attr_get_ilocked(ip, &args);
 	xfs_iunlock(ip, lock_mode);
-
 	*valuelenp = args.valuelen;
-	return error == -EEXIST ? 0 : error;
+
+	/* on error, we have to clean up allocated value buffers */
+	if (error) {
+		if (flags & ATTR_ALLOC) {
+			kmem_free(args.value);
+			*value = NULL;
+		}
+		return error;
+	}
+	*value = args.value;
+	return 0;
 }
 
 /*
@@ -768,6 +801,8 @@ xfs_attr_leaf_removename(
  *
  * This leaf block cannot have a "remote" value, we only call this routine
  * if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -789,9 +824,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 	}
 	error = xfs_attr3_leaf_getvalue(bp, args);
 	xfs_trans_brelse(args->trans, bp);
-	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
-		error = xfs_attr_rmtval_get(args);
-	}
 	return error;
 }
 
@@ -1268,11 +1300,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 }
 
 /*
- * Look up a filename in a node attribute list.
+ * Retrieve the attribute data from a node attribute list.
  *
  * This routine gets called for any attribute fork that has more than one
  * block, ie: both true Btree attr lists and for single-leaf-blocks with
  * "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_node_get(xfs_da_args_t *args)
@@ -1294,24 +1328,21 @@ xfs_attr_node_get(xfs_da_args_t *args)
 	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error) {
 		retval = error;
-	} else if (retval == -EEXIST) {
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->bp != NULL);
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-		/*
-		 * Get the value, local or "remote"
-		 */
-		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
-		if (!retval && (args->rmtblkno > 0)
-		    && !(args->flags & ATTR_KERNOVAL)) {
-			retval = xfs_attr_rmtval_get(args);
-		}
+		goto out_release;
 	}
+	if (retval != -EEXIST)
+		goto out_release;
+
+	/*
+	 * Get the value, local or "remote"
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	retval = xfs_attr3_leaf_getvalue(blk->bp, args);
 
 	/*
 	 * If not in a transaction, we have to release all the buffers.
 	 */
+out_release:
 	for (i = 0; i < state->path.active; i++) {
 		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
 		state->path.blk[i].bp = NULL;
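
The new calling convention is easiest to see from a consumer. A hypothetical caller sketch (assumed usage, not code from this patch) of the ATTR_ALLOC path, where the kernel sizes and allocates the value buffer and the caller owns freeing it:

	unsigned char	*value = NULL;
	int		valuelen = XFS_XATTR_SIZE_MAX;
	int		error;

	error = xfs_attr_get(ip, name, &value, &valuelen, ATTR_ALLOC);
	if (error)
		return error;	/* on failure, value is guaranteed NULL */
	/* ... use value[0 .. valuelen) ... */
	kmem_free(value);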
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index ff28ebf3b635..94badfa1743e 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -37,6 +37,7 @@ struct xfs_attr_list_context;
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 
 #define ATTR_INCOMPLETE	0x4000	/* [kernel] return INCOMPLETE attr keys */
+#define ATTR_ALLOC	0x8000	/* allocate xattr buffer on demand */
 
 #define XFS_ATTR_FLAGS \
 	{ ATTR_DONTFOLLOW,	"DONTFOLLOW" }, \
@@ -47,7 +48,8 @@ struct xfs_attr_list_context;
 	{ ATTR_REPLACE,		"REPLACE" }, \
 	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
 	{ ATTR_KERNOVAL,	"KERNOVAL" }, \
-	{ ATTR_INCOMPLETE,	"INCOMPLETE" }
+	{ ATTR_INCOMPLETE,	"INCOMPLETE" }, \
+	{ ATTR_ALLOC,		"ALLOC" }
 
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
@@ -143,7 +145,7 @@ int xfs_attr_list_int(struct xfs_attr_list_context *);
 int xfs_inode_hasattr(struct xfs_inode *ip);
 int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
 int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-		 unsigned char *value, int *valuelenp, int flags);
+		 unsigned char **value, int *valuelenp, int flags);
 int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 		 unsigned char *value, int valuelen, int flags);
 int xfs_attr_set_args(struct xfs_da_args *args);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 70eb941d02e4..b9f019603d0b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -393,6 +393,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
 }
 
+static int
+xfs_attr_copy_value(
+	struct xfs_da_args	*args,
+	unsigned char		*value,
+	int			valuelen)
+{
+	/*
+	 * No copy if all we have to do is get the length
+	 */
+	if (args->flags & ATTR_KERNOVAL) {
+		args->valuelen = valuelen;
+		return 0;
+	}
+
+	/*
+	 * No copy if the length of the existing buffer is too small
+	 */
+	if (args->valuelen < valuelen) {
+		args->valuelen = valuelen;
+		return -ERANGE;
+	}
+
+	if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
+		args->value = kmem_alloc_large(valuelen, 0);
+		if (!args->value)
+			return -ENOMEM;
+	}
+	args->valuelen = valuelen;
+
+	/* remote block xattr requires IO for copy-in */
+	if (args->rmtblkno)
+		return xfs_attr_rmtval_get(args);
+
+	/*
+	 * This is to prevent a GCC warning because the remote xattr case
+	 * doesn't have a value to pass in. In that case, we never reach here,
+	 * but GCC can't work that out and so throws a "passing NULL to
+	 * memcpy" warning.
+	 */
+	if (!value)
+		return -EINVAL;
+	memcpy(args->value, value, valuelen);
+	return 0;
+}
 
 /*========================================================================
  * External routines when attribute fork size < XFS_LITINO(mp).
@@ -720,15 +764,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
 }
 
 /*
- * Look up a name in a shortform attribute list structure.
+ * Retrieve the attribute value and length.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
-/*ARGSUSED*/
 int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+xfs_attr_shortform_getvalue(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
 	int i;
 
 	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
 	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -741,18 +789,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
 			continue;
 		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = sfe->valuelen;
-			return -EEXIST;
-		}
-		if (args->valuelen < sfe->valuelen) {
-			args->valuelen = sfe->valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = sfe->valuelen;
-		memcpy(args->value, &sfe->nameval[args->namelen],
-			args->valuelen);
-		return -EEXIST;
+		return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+					   sfe->valuelen);
 	}
 	return -ENOATTR;
 }
@@ -782,7 +820,7 @@ xfs_attr_shortform_to_leaf(
 	ifp = dp->i_afp;
 	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
 	size = be16_to_cpu(sf->hdr.totsize);
-	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	tmpbuffer = kmem_alloc(size, 0);
 	ASSERT(tmpbuffer != NULL);
 	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
 	sf = (xfs_attr_shortform_t *)tmpbuffer;
@@ -985,7 +1023,7 @@ xfs_attr3_leaf_to_shortform(
 
 	trace_xfs_attr_leaf_to_sf(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	if (!tmpbuffer)
 		return -ENOMEM;
 
@@ -1448,7 +1486,7 @@ xfs_attr3_leaf_compact(
 
 	trace_xfs_attr_leaf_compact(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 	memset(bp->b_addr, 0, args->geo->blksize);
 	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -2167,7 +2205,7 @@ xfs_attr3_leaf_unbalance(
 	struct xfs_attr_leafblock *tmp_leaf;
 	struct xfs_attr3_icleaf_hdr tmphdr;
 
-	tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+	tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
 
 	/*
 	 * Copy the header into the temp leaf so that all the stuff
@@ -2350,6 +2388,10 @@ xfs_attr3_leaf_lookup_int(
 /*
  * Get the value associated with an attribute name from a leaf attribute
  * list structure.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
 int
 xfs_attr3_leaf_getvalue(
@@ -2361,7 +2403,6 @@ xfs_attr3_leaf_getvalue(
 	struct xfs_attr_leaf_entry *entry;
 	struct xfs_attr_leaf_name_local *name_loc;
 	struct xfs_attr_leaf_name_remote *name_rmt;
-	int valuelen;
 
 	leaf = bp->b_addr;
 	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2373,36 +2414,19 @@ xfs_attr3_leaf_getvalue(
 		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
 		ASSERT(name_loc->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
-		valuelen = be16_to_cpu(name_loc->valuelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
-			return 0;
-		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = valuelen;
-		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		ASSERT(name_rmt->namelen == args->namelen);
-		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       args->rmtvaluelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = args->rmtvaluelen;
-			return 0;
-		}
-		if (args->valuelen < args->rmtvaluelen) {
-			args->valuelen = args->rmtvaluelen;
-			return -ERANGE;
-		}
-		args->valuelen = args->rmtvaluelen;
-	}
-	return 0;
+		return xfs_attr_copy_value(args,
+					&name_loc->nameval[args->namelen],
+					be16_to_cpu(name_loc->valuelen));
+	}
+
+	name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+	ASSERT(name_rmt->namelen == args->namelen);
+	ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+	args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+	args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+	args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+					       args->rmtvaluelen);
+	return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
 }
 
 /*========================================================================
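
For callers that manage their own buffers, the consolidated xfs_attr_copy_value() keeps the classic two-step probe working. An illustrative sketch (assumed usage, not from this patch):

	unsigned char	*buf = NULL;
	int		len = 0;
	int		error;

	/* Step 1: ATTR_KERNOVAL returns only the size, no copy is made. */
	error = xfs_attr_get(ip, name, &buf, &len, ATTR_KERNOVAL);
	if (error)
		return error;

	/*
	 * Step 2: fetch into a buffer of that size. If the attribute grew
	 * in the meantime, this returns -ERANGE with the new size in len.
	 */
	buf = kmem_alloc_large(len, 0);
	if (!buf)
		return -ENOMEM;
	error = xfs_attr_get(ip, name, &buf, &len, 0);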
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4eb30d357045..3e39b7d40f25 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -358,6 +358,8 @@ xfs_attr_rmtval_copyin(
 /*
  * Read the value associated with an attribute from the out-of-line buffer
  * that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 int
 xfs_attr_rmtval_get(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 07aad70f3931..054b4ce30033 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -553,7 +553,7 @@ __xfs_bmap_add_free(
 #endif
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = bno;
 	new->xefi_blockcount = (xfs_extlen_t)len;
 	if (oinfo)
@@ -1099,7 +1099,7 @@ xfs_bmap_add_attrfork(
 	if (error)
 		goto trans_cancel;
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
 	switch (ip->i_d.di_format) {
@@ -1985,11 +1985,8 @@ xfs_bmap_add_extent_delay_real(
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(bma->flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(bma->flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2471,9 +2468,7 @@ xfs_bmap_add_extent_unwritten_real(
 	}
 
 	/* update reverse mappings */
-	error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
-	if (error)
-		goto done;
+	xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2832,11 +2827,8 @@ xfs_bmap_add_extent_hole_real(
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(tp, ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -4050,12 +4042,8 @@ xfs_bmapi_allocate(
 	 */
 	if (!(bma->flags & XFS_BMAPI_METADATA)) {
 		bma->datatype = XFS_ALLOC_NOBUSY;
-		if (whichfork == XFS_DATA_FORK) {
-			if (bma->offset == 0)
-				bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
-			else
-				bma->datatype |= XFS_ALLOC_USERDATA;
-		}
+		if (whichfork == XFS_DATA_FORK && bma->offset == 0)
+			bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
 		if (bma->flags & XFS_BMAPI_ZERO)
 			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
 	}
@@ -4401,12 +4389,9 @@ xfs_bmapi_write(
 			 * If this is a CoW allocation, record the data in
 			 * the refcount btree for orphan recovery.
 			 */
-			if (whichfork == XFS_COW_FORK) {
-				error = xfs_refcount_alloc_cow_extent(tp,
-						bma.blkno, bma.length);
-				if (error)
-					goto error0;
-			}
+			if (whichfork == XFS_COW_FORK)
+				xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+						bma.length);
 		}
 
 		/* Deal with the allocated space we found. */
@@ -4530,7 +4515,7 @@ xfs_bmapi_convert_delalloc(
 	if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
 		goto out_finish;
 	error = -EFSCORRUPTED;
-	if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
 		goto out_finish;
 
 	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
@@ -4540,12 +4525,8 @@ xfs_bmapi_convert_delalloc(
 	*imap = bma.got;
 	*seq = READ_ONCE(ifp->if_seq);
 
-	if (whichfork == XFS_COW_FORK) {
-		error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
-				bma.length);
-		if (error)
-			goto out_finish;
-	}
+	if (whichfork == XFS_COW_FORK)
+		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
 
 	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
 			whichfork);
@@ -5149,18 +5130,14 @@ xfs_bmap_del_extent_real(
 	}
 
 	/* remove reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, del);
-	if (error)
-		goto done;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, del);
 
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
 	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
-			error = xfs_refcount_decrease_extent(tp, del);
-			if (error)
-				goto done;
+			xfs_refcount_decrease_extent(tp, del);
 		} else {
 			__xfs_bmap_add_free(tp, del->br_startblock,
 					del->br_blockcount, NULL,
@@ -5651,12 +5628,11 @@ done:
 			&new);
 
 	/* update reverse mapping. rmap functions merge the rmaps for us */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, got);
-	if (error)
-		return error;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, got);
 	memcpy(&new, got, sizeof(new));
 	new.br_startoff = left->br_startoff + left->br_blockcount;
-	return xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	return 0;
 }
 
 static int
@@ -5695,10 +5671,9 @@ xfs_bmap_shift_update_extent(
 			got);
 
 	/* update reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
-	if (error)
-		return error;
-	return xfs_rmap_map_extent(tp, ip, whichfork, got);
+	xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+	xfs_rmap_map_extent(tp, ip, whichfork, got);
+	return 0;
 }
 
 int
@@ -6094,7 +6069,7 @@ __xfs_bmap_add(
 			bmap->br_blockcount,
 			bmap->br_state);
 
-	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
 	INIT_LIST_HEAD(&bi->bi_list);
 	bi->bi_type = type;
 	bi->bi_owner = ip;
@@ -6106,29 +6081,29 @@ __xfs_bmap_add(
 }
 
 /* Map an extent into a file. */
-int
+void
 xfs_bmap_map_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /* Unmap an extent out of a file. */
-int
+void
 xfs_bmap_unmap_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8f597f9abdbe..5bb446d80542 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -171,6 +171,13 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
 		!isnullstartblock(irec->br_startblock);
 }
 
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+	((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
 void	xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
 		xfs_filblks_t len);
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
@@ -254,9 +261,9 @@ int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
 		enum xfs_bmap_intent_type type, int whichfork,
 		xfs_fileoff_t startoff, xfs_fsblock_t startblock,
 		xfs_filblks_t *blockcount, xfs_exntst_t state);
-int	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
-int	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
 
 static inline int xfs_bmap_fork_to_state(int whichfork)
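
What the new macro accepts and rejects, illustratively: block zero of the data device always holds a superblock, so it can never be a valid mapping target there, while realtime inodes carry no such reservation:

	xfs_valid_startblock(ip, 123);	/* true for any inode */
	xfs_valid_startblock(ip, 0);	/* true only if XFS_IS_REALTIME_INODE(ip) */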
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fbb18ba5d905..ffe608d2a2d9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys(
 	union xfs_btree_key	*k1,
 	union xfs_btree_key	*k2)
 {
-	return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) -
-			  be64_to_cpu(k2->bmbt.br_startoff);
+	uint64_t		a = be64_to_cpu(k1->bmbt.br_startoff);
+	uint64_t		b = be64_to_cpu(k2->bmbt.br_startoff);
+
+	/*
+	 * Note: This routine previously casted a and b to int64 and subtracted
+	 * them to generate a result. This led to problems if b was the
+	 * "maximum" key value (all ones) being signed incorrectly, hence this
+	 * somewhat less efficient version.
+	 */
+	if (a > b)
+		return 1;
+	if (b > a)
+		return -1;
+	return 0;
 }
 
 static xfs_failaddr_t
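
The failure mode the new comparison avoids reproduces easily in isolation. A standalone sketch (not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t a = 0;
		uint64_t b = UINT64_MAX;	/* the all-ones "maximum" key */

		/* old style: (int64_t)b is -1, so 0 - (-1) = 1, claiming a > b */
		printf("%lld\n", (long long)((int64_t)a - (int64_t)b));

		/* new style: explicit comparison yields -1, i.e. a < b */
		printf("%d\n", a > b ? 1 : (b > a ? -1 : 0));
		return 0;
	}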
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f1048efa4268..71de937f9e64 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4466,8 +4466,6 @@ xfs_btree_lblock_verify(
  * btree block
  *
  * @bp: buffer containing the btree block
- * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
- * @pag_max_level: pointer to the per-ag max level field
  */
 xfs_failaddr_t
 xfs_btree_sblock_v5hdr_verify(
@@ -4600,7 +4598,7 @@ xfs_btree_simple_query_range(
 
 		/* Callback */
 		error = fn(cur, recp, priv);
-		if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+		if (error)
 			break;
 
 advloop:
@@ -4702,8 +4700,7 @@ pop_up:
 		 */
 		if (ldiff >= 0 && hdiff >= 0) {
 			error = fn(cur, recp, priv);
-			if (error < 0 ||
-			    error == XFS_BTREE_QUERY_RANGE_ABORT)
+			if (error)
 				break;
 		} else if (hdiff < 0) {
 			/* Record is larger than high key; pop. */
@@ -4774,8 +4771,7 @@ out:
  * Query a btree for all records overlapping a given interval of keys.  The
  * supplied function will be called with each record found; return one of the
  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
- * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
- * negative error code.
+ * code.  This function returns -ECANCELED, zero, or a negative error code.
  */
 int
 xfs_btree_query_range(
@@ -4891,7 +4887,7 @@ xfs_btree_has_record_helper(
 	union xfs_btree_rec		*rec,
 	void				*priv)
 {
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /* Is there a record covering a given range of keys? */
@@ -4906,7 +4902,7 @@ xfs_btree_has_record(
 
 	error = xfs_btree_query_range(cur, low, high,
 			&xfs_btree_has_record_helper, NULL);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+	if (error == -ECANCELED) {
 		*exists = true;
 		return 0;
 	}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fa3cd8ab9aba..ced1e65d1483 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -464,9 +464,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
 uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
 unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
 
-/* return codes */
-#define XFS_BTREE_QUERY_RANGE_CONTINUE	(XFS_ITER_CONTINUE) /* keep iterating */
-#define XFS_BTREE_QUERY_RANGE_ABORT	(XFS_ITER_ABORT)    /* stop iterating */
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating.  Any non-zero value will be
+ * passed up to the _query_range caller.  The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
 typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
 		union xfs_btree_rec *rec, void *priv);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 0bf56e94bfe9..4fd1223c1bd5 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2098,7 +2098,7 @@ xfs_da_grow_inode_int(
 	 * If we didn't get it and the block might work if fragmented,
 	 * try without the CONTIG flag.  Loop until we get it all.
 	 */
-	mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+	mapp = kmem_alloc(sizeof(*mapp) * count, 0);
 	for (b = *bno, mapi = 0; b < *bno + count; ) {
 		nmap = min(XFS_BMAP_MAX_NMAP, count);
 		c = (int)(*bno + count - b);
@@ -2480,7 +2480,7 @@ xfs_buf_map_from_irec(
 
 	if (nirecs > 1) {
 		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
-				  KM_SLEEP | KM_NOFS);
+				  KM_NOFS);
 		if (!map)
 			return -ENOMEM;
 		*mapp = map;
@@ -2539,7 +2539,7 @@ xfs_dabuf_map(
 	 */
 	if (nfsb != 1)
 		irecs = kmem_zalloc(sizeof(irec) * nfsb,
-				    KM_SLEEP | KM_NOFS);
+				    KM_NOFS);
 
 	nirecs = nfsb;
 	error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865b6c3d..ae0bbd20d9ca 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -81,13 +81,15 @@ typedef struct xfs_da_args {
 #define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
 #define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
 #define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
+#define XFS_DA_OP_ALLOCVAL	0x0020	/* lookup to alloc buffer if found */
 
 #define XFS_DA_OP_FLAGS \
 	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
 	{ XFS_DA_OP_RENAME,	"RENAME" }, \
 	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
 	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
-	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
+	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }, \
+	{ XFS_DA_OP_ALLOCVAL,	"ALLOCVAL" }
 
 /*
  * Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eb2be2a6a25a..22557527cfdb 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -517,7 +517,7 @@ xfs_defer_add(
 	}
 	if (!dfp) {
 		dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
-				KM_SLEEP | KM_NOFS);
+				KM_NOFS);
 		dfp->dfp_type = type;
 		dfp->dfp_intent = NULL;
 		dfp->dfp_done = NULL;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 67840723edbb..867c5dee0751 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -110,9 +110,9 @@ xfs_da_mount(
 
 	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
 	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				    KM_SLEEP | KM_MAYFAIL);
+				    KM_MAYFAIL);
 	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				     KM_SLEEP | KM_MAYFAIL);
+				     KM_MAYFAIL);
 	if (!mp->m_dir_geo || !mp->m_attr_geo) {
 		kmem_free(mp->m_dir_geo);
 		kmem_free(mp->m_attr_geo);
@@ -217,7 +217,7 @@ xfs_dir_init(
 	if (error)
 		return error;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -254,7 +254,7 @@ xfs_dir_createname(
 		XFS_STATS_INC(dp->i_mount, xs_dir_create);
 	}
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -353,7 +353,7 @@ xfs_dir_lookup(
 	 * lockdep.  Doing this avoids having to add a bunch of lockdep class
 	 * annotations into the reclaim path for the ilock.
 	 */
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
 	args->namelen = name->len;
@@ -422,7 +422,7 @@ xfs_dir_removename(
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
 	XFS_STATS_INC(dp->i_mount, xs_dir_remove);
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -483,7 +483,7 @@ xfs_dir_replace(
 	if (rval)
 		return rval;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a6fb0cc2085e..9595ced393dc 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1092,7 +1092,7 @@ xfs_dir2_sf_to_block(
 	 * Copy the directory into a temporary buffer.
 	 * Then pitch the incore inode data so we can make extents.
 	 */
-	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+	sfp = kmem_alloc(ifp->if_bytes, 0);
 	memcpy(sfp, oldsfp, ifp->if_bytes);
 
 	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 1fc44efc344d..705c4f562758 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -32,8 +32,6 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, 32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
33 int index, xfs_da_state_blk_t *dblk, 33 int index, xfs_da_state_blk_t *dblk,
34 int *rval); 34 int *rval);
35static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
36 xfs_da_state_blk_t *fblk);
37 35
38/* 36/*
39 * Check internal consistency of a leafn block. 37 * Check internal consistency of a leafn block.
@@ -1611,113 +1609,152 @@ xfs_dir2_leafn_unbalance(
1611} 1609}
1612 1610
1613/* 1611/*
1614 * Top-level node form directory addname routine. 1612 * Add a new data block to the directory at the free space index that the caller
1613 * has specified.
1615 */ 1614 */
1616int /* error */ 1615static int
1617xfs_dir2_node_addname( 1616xfs_dir2_node_add_datablk(
1618 xfs_da_args_t *args) /* operation arguments */ 1617 struct xfs_da_args *args,
1618 struct xfs_da_state_blk *fblk,
1619 xfs_dir2_db_t *dbno,
1620 struct xfs_buf **dbpp,
1621 struct xfs_buf **fbpp,
1622 int *findex)
1619{ 1623{
1620 xfs_da_state_blk_t *blk; /* leaf block for insert */ 1624 struct xfs_inode *dp = args->dp;
1621 int error; /* error return value */ 1625 struct xfs_trans *tp = args->trans;
1622 int rval; /* sub-return value */ 1626 struct xfs_mount *mp = dp->i_mount;
1623 xfs_da_state_t *state; /* btree cursor */ 1627 struct xfs_dir3_icfree_hdr freehdr;
1628 struct xfs_dir2_data_free *bf;
1629 struct xfs_dir2_data_hdr *hdr;
1630 struct xfs_dir2_free *free = NULL;
1631 xfs_dir2_db_t fbno;
1632 struct xfs_buf *fbp;
1633 struct xfs_buf *dbp;
1634 __be16 *bests = NULL;
1635 int error;
1624 1636
1625 trace_xfs_dir2_node_addname(args); 1637 /* Not allowed to allocate, return failure. */
1638 if (args->total == 0)
1639 return -ENOSPC;
1640
1641 /* Allocate and initialize the new data block. */
1642 error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
1643 if (error)
1644 return error;
1645 error = xfs_dir3_data_init(args, *dbno, &dbp);
1646 if (error)
1647 return error;
1626 1648
1627 /* 1649 /*
1628 * Allocate and initialize the state (btree cursor). 1650 * Get the freespace block corresponding to the data block
1629 */ 1651 * that was just allocated.
1630 state = xfs_da_state_alloc();
1631 state->args = args;
1632 state->mp = args->dp->i_mount;
1633 /*
1634 * Look up the name. We're not supposed to find it, but
1635 * this gives us the insertion point.
1636 */ 1652 */
1637 error = xfs_da3_node_lookup_int(state, &rval); 1653 fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
1654 error = xfs_dir2_free_try_read(tp, dp,
1655 xfs_dir2_db_to_da(args->geo, fbno), &fbp);
1638 if (error) 1656 if (error)
1639 rval = error; 1657 return error;
1640 if (rval != -ENOENT) { 1658
1641 goto done;
1642 }
1643 /* 1659 /*
1644 * Add the data entry to a data block. 1660 * If there wasn't a freespace block, the read will
1645 * Extravalid is set to a freeblock found by lookup. 1661 * return a NULL fbp. Allocate and initialize a new one.
1646 */ 1662 */
1647 rval = xfs_dir2_node_addname_int(args, 1663 if (!fbp) {
1648 state->extravalid ? &state->extrablk : NULL); 1664 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
1649 if (rval) { 1665 if (error)
1650 goto done; 1666 return error;
1667
1668 if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
1669 xfs_alert(mp,
1670"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
1671 __func__, (unsigned long long)dp->i_ino,
1672 (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
1673 (long long)*dbno, (long long)fbno);
1674 if (fblk) {
1675 xfs_alert(mp,
1676 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1677 fblk, (unsigned long long)fblk->blkno,
1678 fblk->index, fblk->magic);
1679 } else {
1680 xfs_alert(mp, " ... fblk is NULL");
1681 }
1682 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
1683 return -EFSCORRUPTED;
1684 }
1685
1686 /* Get a buffer for the new block. */
1687 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1688 if (error)
1689 return error;
1690 free = fbp->b_addr;
1691 bests = dp->d_ops->free_bests_p(free);
1692 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1693
1694 /* Remember the first slot as our empty slot. */
1695 freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
1696 XFS_DIR2_FREE_OFFSET)) *
1697 dp->d_ops->free_max_bests(args->geo);
1698 } else {
1699 free = fbp->b_addr;
1700 bests = dp->d_ops->free_bests_p(free);
1701 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1651 } 1702 }
1652 blk = &state->path.blk[state->path.active - 1]; 1703
1653 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); 1704 /* Set the freespace block index from the data block number. */
1705 *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
1706
1707 /* Extend the freespace table if the new data block is off the end. */
1708 if (*findex >= freehdr.nvalid) {
1709 ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
1710 freehdr.nvalid = *findex + 1;
1711 bests[*findex] = cpu_to_be16(NULLDATAOFF);
1712 }
1713
1654 /* 1714 /*
1655 * Add the new leaf entry. 1715 * If this entry was for an empty data block (this should always be
1716 * true) then update the header.
1656 */ 1717 */
1657 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); 1718 if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
1658 if (rval == 0) { 1719 freehdr.nused++;
1659 /* 1720 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1660 * It worked, fix the hash values up the btree. 1721 xfs_dir2_free_log_header(args, fbp);
1661 */
1662 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
1663 xfs_da3_fixhashpath(state, &state->path);
1664 } else {
1665 /*
1666 * It didn't work, we need to split the leaf block.
1667 */
1668 if (args->total == 0) {
1669 ASSERT(rval == -ENOSPC);
1670 goto done;
1671 }
1672 /*
1673 * Split the leaf block and insert the new entry.
1674 */
1675 rval = xfs_da3_split(state);
1676 } 1722 }
1677done: 1723
1678 xfs_da_state_free(state); 1724 /* Update the freespace value for the new block in the table. */
1679 return rval; 1725 hdr = dbp->b_addr;
1726 bf = dp->d_ops->data_bestfree_p(hdr);
1727 bests[*findex] = bf[0].length;
1728
1729 *dbpp = dbp;
1730 *fbpp = fbp;
1731 return 0;
1680} 1732}
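The two d_ops hooks used above, db_to_fdb() and db_to_fdindex(), map a directory data block number to the freespace block that tracks it and to its slot in that block's bests[] array. A minimal sketch of that arithmetic, with max_bests standing in for dp->d_ops->free_max_bests(args->geo) and free_first_db for xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET):

/* Sketch only: each freespace block covers max_bests data blocks. */
static xfs_dir2_db_t
sketch_db_to_fdb(xfs_dir2_db_t dbno, xfs_dir2_db_t free_first_db, int max_bests)
{
        return free_first_db + dbno / max_bests;  /* owning freespace block */
}

static int
sketch_db_to_fdindex(xfs_dir2_db_t dbno, int max_bests)
{
        return dbno % max_bests;                  /* slot in bests[] */
}

The same arithmetic explains why freehdr.firstdb for a freshly allocated freespace block is computed above as (fbno - free_first_db) * max_bests.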
1681 1733
1682/* 1734static int
1683 * Add the data entry for a node-format directory name addition. 1735xfs_dir2_node_find_freeblk(
1684 * The leaf entry is added in xfs_dir2_leafn_add. 1736 struct xfs_da_args *args,
1685 * We may enter with a freespace block that the lookup found. 1737 struct xfs_da_state_blk *fblk,
1686 */ 1738 xfs_dir2_db_t *dbnop,
1687static int /* error */ 1739 struct xfs_buf **fbpp,
1688xfs_dir2_node_addname_int( 1740 int *findexp,
1689 xfs_da_args_t *args, /* operation arguments */ 1741 int length)
1690 xfs_da_state_blk_t *fblk) /* optional freespace block */
1691{ 1742{
1692 xfs_dir2_data_hdr_t *hdr; /* data block header */
1693 xfs_dir2_db_t dbno; /* data block number */
1694 struct xfs_buf *dbp; /* data block buffer */
1695 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1696 xfs_inode_t *dp; /* incore directory inode */
1697 xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
1698 int error; /* error return value */
1699 xfs_dir2_db_t fbno; /* freespace block number */
1700 struct xfs_buf *fbp; /* freespace buffer */
1701 int findex; /* freespace entry index */
1702 xfs_dir2_free_t *free=NULL; /* freespace block structure */
1703 xfs_dir2_db_t ifbno; /* initial freespace block no */
1704 xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
1705 int length; /* length of the new entry */
1706 int logfree; /* need to log free entry */
1707 xfs_mount_t *mp; /* filesystem mount point */
1708 int needlog; /* need to log data header */
1709 int needscan; /* need to rescan data frees */
1710 __be16 *tagp; /* data entry tag pointer */
1711 xfs_trans_t *tp; /* transaction pointer */
1712 __be16 *bests;
1713 struct xfs_dir3_icfree_hdr freehdr; 1743 struct xfs_dir3_icfree_hdr freehdr;
1714 struct xfs_dir2_data_free *bf; 1744 struct xfs_dir2_free *free = NULL;
1715 xfs_dir2_data_aoff_t aoff; 1745 struct xfs_inode *dp = args->dp;
1746 struct xfs_trans *tp = args->trans;
1747 struct xfs_buf *fbp = NULL;
1748 xfs_dir2_db_t firstfbno;
1749 xfs_dir2_db_t lastfbno;
1750 xfs_dir2_db_t ifbno = -1;
1751 xfs_dir2_db_t dbno = -1;
1752 xfs_dir2_db_t fbno;
1753 xfs_fileoff_t fo;
1754 __be16 *bests = NULL;
1755 int findex = 0;
1756 int error;
1716 1757
1717 dp = args->dp;
1718 mp = dp->i_mount;
1719 tp = args->trans;
1720 length = dp->d_ops->data_entsize(args->namelen);
1721 /* 1758 /*
1722 * If we came in with a freespace block that means that lookup 1759 * If we came in with a freespace block that means that lookup
1723 * found an entry with our hash value. This is the freespace 1760 * found an entry with our hash value. This is the freespace
@@ -1725,288 +1762,157 @@ xfs_dir2_node_addname_int(
1725 */ 1762 */
1726 if (fblk) { 1763 if (fblk) {
1727 fbp = fblk->bp; 1764 fbp = fblk->bp;
1728 /*
1729 * Remember initial freespace block number.
1730 */
1731 ifbno = fblk->blkno;
1732 free = fbp->b_addr; 1765 free = fbp->b_addr;
1733 findex = fblk->index; 1766 findex = fblk->index;
1734 bests = dp->d_ops->free_bests_p(free);
1735 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1736
1737 /*
1738 * This means the free entry showed that the data block had
1739 * space for our entry, so we remembered it.
1740 * Use that data block.
1741 */
1742 if (findex >= 0) { 1767 if (findex >= 0) {
1768 /* caller already found the freespace for us. */
1769 bests = dp->d_ops->free_bests_p(free);
1770 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1771
1743 ASSERT(findex < freehdr.nvalid); 1772 ASSERT(findex < freehdr.nvalid);
1744 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 1773 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
1745 ASSERT(be16_to_cpu(bests[findex]) >= length); 1774 ASSERT(be16_to_cpu(bests[findex]) >= length);
1746 dbno = freehdr.firstdb + findex; 1775 dbno = freehdr.firstdb + findex;
1747 } else { 1776 goto found_block;
1748 /*
1749 * The data block looked at didn't have enough room.
1750 * We'll start at the beginning of the freespace entries.
1751 */
1752 dbno = -1;
1753 findex = 0;
1754 } 1777 }
1755 } else { 1778
1756 /* 1779 /*
1757 * Didn't come in with a freespace block, so no data block. 1780 * The data block looked at didn't have enough room.
1781 * We'll start at the beginning of the freespace entries.
1758 */ 1782 */
1759 ifbno = dbno = -1; 1783 ifbno = fblk->blkno;
1784 xfs_trans_brelse(tp, fbp);
1760 fbp = NULL; 1785 fbp = NULL;
1761 findex = 0; 1786 fblk->bp = NULL;
1762 } 1787 }
1763 1788
1764 /* 1789 /*
1765 * If we don't have a data block yet, we're going to scan the 1790 * If we don't have a data block yet, we're going to scan the freespace
1766 * freespace blocks looking for one. Figure out what the 1791 * data for a data block with enough free space in it.
1767 * highest freespace block number is.
1768 */
1769 if (dbno == -1) {
1770 xfs_fileoff_t fo; /* freespace block number */
1771
1772 if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
1773 return error;
1774 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1775 fbno = ifbno;
1776 }
1777 /*
1778 * While we haven't identified a data block, search the freeblock
1779 * data for a good data block. If we find a null freeblock entry,
1780 * indicating a hole in the data blocks, remember that.
1781 */ 1792 */
1782 while (dbno == -1) { 1793 error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
1783 /* 1794 if (error)
1784 * If we don't have a freeblock in hand, get the next one. 1795 return error;
1785 */ 1796 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1786 if (fbp == NULL) { 1797 firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
1787 /*
1788 * Happens the first time through unless lookup gave
1789 * us a freespace block to start with.
1790 */
1791 if (++fbno == 0)
1792 fbno = xfs_dir2_byte_to_db(args->geo,
1793 XFS_DIR2_FREE_OFFSET);
1794 /*
1795 * If it's ifbno we already looked at it.
1796 */
1797 if (fbno == ifbno)
1798 fbno++;
1799 /*
1800 * If it's off the end we're done.
1801 */
1802 if (fbno >= lastfbno)
1803 break;
1804 /*
1805 * Read the block. There can be holes in the
1806 * freespace blocks, so this might not succeed.
1807 * This should be really rare, so there's no reason
1808 * to avoid it.
1809 */
1810 error = xfs_dir2_free_try_read(tp, dp,
1811 xfs_dir2_db_to_da(args->geo, fbno),
1812 &fbp);
1813 if (error)
1814 return error;
1815 if (!fbp)
1816 continue;
1817 free = fbp->b_addr;
1818 findex = 0;
1819 }
1820 /*
1821 * Look at the current free entry. Is it good enough?
1822 *
 1823 * The bests initialisation should be where the buffer is read in
1824 * the above branch. But gcc is too stupid to realise that bests
1825 * and the freehdr are actually initialised if they are placed
1826 * there, so we have to do it here to avoid warnings. Blech.
1827 */
1828 bests = dp->d_ops->free_bests_p(free);
1829 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1830 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1831 be16_to_cpu(bests[findex]) >= length)
1832 dbno = freehdr.firstdb + findex;
1833 else {
1834 /*
1835 * Are we done with the freeblock?
1836 */
1837 if (++findex == freehdr.nvalid) {
1838 /*
1839 * Drop the block.
1840 */
1841 xfs_trans_brelse(tp, fbp);
1842 fbp = NULL;
1843 if (fblk && fblk->bp)
1844 fblk->bp = NULL;
1845 }
1846 }
1847 }
1848 /*
1849 * If we don't have a data block, we need to allocate one and make
1850 * the freespace entries refer to it.
1851 */
1852 if (unlikely(dbno == -1)) {
1853 /*
1854 * Not allowed to allocate, return failure.
1855 */
1856 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
1857 return -ENOSPC;
1858
1859 /*
1860 * Allocate and initialize the new data block.
1861 */
1862 if (unlikely((error = xfs_dir2_grow_inode(args,
1863 XFS_DIR2_DATA_SPACE,
1864 &dbno)) ||
1865 (error = xfs_dir3_data_init(args, dbno, &dbp))))
1866 return error;
1867 1798
1868 /* 1799 for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
1869 * If (somehow) we have a freespace block, get rid of it. 1800 /* If it's ifbno we already looked at it. */
1870 */ 1801 if (fbno == ifbno)
1871 if (fbp) 1802 continue;
1872 xfs_trans_brelse(tp, fbp);
1873 if (fblk && fblk->bp)
1874 fblk->bp = NULL;
1875 1803
1876 /* 1804 /*
1877 * Get the freespace block corresponding to the data block 1805 * Read the block. There can be holes in the freespace blocks,
1878 * that was just allocated. 1806 * so this might not succeed. This should be really rare, so
1807 * there's no reason to avoid it.
1879 */ 1808 */
1880 fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
1881 error = xfs_dir2_free_try_read(tp, dp, 1809 error = xfs_dir2_free_try_read(tp, dp,
1882 xfs_dir2_db_to_da(args->geo, fbno), 1810 xfs_dir2_db_to_da(args->geo, fbno),
1883 &fbp); 1811 &fbp);
1884 if (error) 1812 if (error)
1885 return error; 1813 return error;
1814 if (!fbp)
1815 continue;
1886 1816
1887 /* 1817 free = fbp->b_addr;
1888 * If there wasn't a freespace block, the read will 1818 bests = dp->d_ops->free_bests_p(free);
1889 * return a NULL fbp. Allocate and initialize a new one. 1819 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1890 */
1891 if (!fbp) {
1892 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
1893 &fbno);
1894 if (error)
1895 return error;
1896 1820
1897 if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { 1821 /* Scan the free entry array for a large enough free space. */
1898 xfs_alert(mp, 1822 for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
1899"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", 1823 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1900 __func__, (unsigned long long)dp->i_ino, 1824 be16_to_cpu(bests[findex]) >= length) {
1901 (long long)dp->d_ops->db_to_fdb( 1825 dbno = freehdr.firstdb + findex;
1902 args->geo, dbno), 1826 goto found_block;
1903 (long long)dbno, (long long)fbno,
1904 (unsigned long long)ifbno, lastfbno);
1905 if (fblk) {
1906 xfs_alert(mp,
1907 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1908 fblk,
1909 (unsigned long long)fblk->blkno,
1910 fblk->index,
1911 fblk->magic);
1912 } else {
1913 xfs_alert(mp, " ... fblk is NULL");
1914 }
1915 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1916 XFS_ERRLEVEL_LOW, mp);
1917 return -EFSCORRUPTED;
1918 } 1827 }
1919
1920 /*
1921 * Get a buffer for the new block.
1922 */
1923 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1924 if (error)
1925 return error;
1926 free = fbp->b_addr;
1927 bests = dp->d_ops->free_bests_p(free);
1928 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1929
1930 /*
1931 * Remember the first slot as our empty slot.
1932 */
1933 freehdr.firstdb =
1934 (fbno - xfs_dir2_byte_to_db(args->geo,
1935 XFS_DIR2_FREE_OFFSET)) *
1936 dp->d_ops->free_max_bests(args->geo);
1937 } else {
1938 free = fbp->b_addr;
1939 bests = dp->d_ops->free_bests_p(free);
1940 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1941 } 1828 }
1942 1829
1943 /* 1830 /* Didn't find free space, go on to next free block */
1944 * Set the freespace block index from the data block number. 1831 xfs_trans_brelse(tp, fbp);
1945 */
1946 findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
1947 /*
1948 * If it's after the end of the current entries in the
1949 * freespace block, extend that table.
1950 */
1951 if (findex >= freehdr.nvalid) {
1952 ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
1953 freehdr.nvalid = findex + 1;
1954 /*
1955 * Tag new entry so nused will go up.
1956 */
1957 bests[findex] = cpu_to_be16(NULLDATAOFF);
1958 }
1959 /*
1960 * If this entry was for an empty data block
1961 * (this should always be true) then update the header.
1962 */
1963 if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
1964 freehdr.nused++;
1965 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1966 xfs_dir2_free_log_header(args, fbp);
1967 }
1968 /*
1969 * Update the real value in the table.
1970 * We haven't allocated the data entry yet so this will
1971 * change again.
1972 */
1973 hdr = dbp->b_addr;
1974 bf = dp->d_ops->data_bestfree_p(hdr);
1975 bests[findex] = bf[0].length;
1976 logfree = 1;
1977 } 1832 }
1833
1834found_block:
1835 *dbnop = dbno;
1836 *fbpp = fbp;
1837 *findexp = findex;
1838 return 0;
1839}
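Note the scan order in xfs_dir2_node_find_freeblk(): it walks the freespace blocks from lastfbno - 1 down to firstfbno, and within each block scans the bests[] entries from the top down, taking the first one with at least length bytes free. Condensed to a skeleton (a sketch, not the committed code; read_free_block() stands in for xfs_dir2_free_try_read() plus the header unpacking, and endian conversions are elided):

for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
        if (fbno == ifbno)
                continue;               /* caller already rejected this one */
        error = read_free_block(fbno, &fbp);
        if (error)
                return error;
        if (!fbp)
                continue;               /* holes in the freespace map are normal */
        for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
                if (bests[findex] != NULLDATAOFF && bests[findex] >= length)
                        goto found_block;       /* dbno = firstdb + findex */
        }
        xfs_trans_brelse(tp, fbp);      /* no room here, try the next block */
}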
1840
1841
1842/*
1843 * Add the data entry for a node-format directory name addition.
1844 * The leaf entry is added in xfs_dir2_leafn_add.
1845 * We may enter with a freespace block that the lookup found.
1846 */
1847static int
1848xfs_dir2_node_addname_int(
1849 struct xfs_da_args *args, /* operation arguments */
1850 struct xfs_da_state_blk *fblk) /* optional freespace block */
1851{
1852 struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
1853 struct xfs_dir2_data_entry *dep; /* data entry pointer */
1854 struct xfs_dir2_data_hdr *hdr; /* data block header */
1855 struct xfs_dir2_data_free *bf;
1856 struct xfs_dir2_free *free = NULL; /* freespace block structure */
1857 struct xfs_trans *tp = args->trans;
1858 struct xfs_inode *dp = args->dp;
1859 struct xfs_buf *dbp; /* data block buffer */
1860 struct xfs_buf *fbp; /* freespace buffer */
1861 xfs_dir2_data_aoff_t aoff;
1862 xfs_dir2_db_t dbno; /* data block number */
1863 int error; /* error return value */
1864 int findex; /* freespace entry index */
1865 int length; /* length of the new entry */
1866 int logfree = 0; /* need to log free entry */
1867 int needlog = 0; /* need to log data header */
1868 int needscan = 0; /* need to rescan data frees */
1869 __be16 *tagp; /* data entry tag pointer */
1870 __be16 *bests;
1871
1872 length = dp->d_ops->data_entsize(args->namelen);
1873 error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
1874 length);
1875 if (error)
1876 return error;
1877
1978 /* 1878 /*
1979 * We had a data block so we don't have to make a new one. 1879 * Now we know if we must allocate blocks, so if we are checking whether
1880 * we can insert without allocation then we can return now.
1980 */ 1881 */
1981 else { 1882 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
1982 /* 1883 if (dbno == -1)
1983 * If just checking, we succeeded. 1884 return -ENOSPC;
1984 */ 1885 return 0;
1985 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 1886 }
1986 return 0;
1987 1887
1988 /* 1888 /*
1989 * Read the data block in. 1889 * If we don't have a data block, we need to allocate one and make
1990 */ 1890 * the freespace entries refer to it.
1891 */
1892 if (dbno == -1) {
1893 /* we're going to have to log the free block index later */
1894 logfree = 1;
1895 error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
1896 &findex);
1897 } else {
1898 /* Read the data block in. */
1991 error = xfs_dir3_data_read(tp, dp, 1899 error = xfs_dir3_data_read(tp, dp,
1992 xfs_dir2_db_to_da(args->geo, dbno), 1900 xfs_dir2_db_to_da(args->geo, dbno),
1993 -1, &dbp); 1901 -1, &dbp);
1994 if (error)
1995 return error;
1996 hdr = dbp->b_addr;
1997 bf = dp->d_ops->data_bestfree_p(hdr);
1998 logfree = 0;
1999 } 1902 }
1903 if (error)
1904 return error;
1905
 1906 /* Set up the data block now. */
1907 hdr = dbp->b_addr;
1908 bf = dp->d_ops->data_bestfree_p(hdr);
2000 ASSERT(be16_to_cpu(bf[0].length) >= length); 1909 ASSERT(be16_to_cpu(bf[0].length) >= length);
2001 /* 1910
2002 * Point to the existing unused space. 1911 /* Point to the existing unused space. */
2003 */
2004 dup = (xfs_dir2_data_unused_t *) 1912 dup = (xfs_dir2_data_unused_t *)
2005 ((char *)hdr + be16_to_cpu(bf[0].offset)); 1913 ((char *)hdr + be16_to_cpu(bf[0].offset));
2006 needscan = needlog = 0; 1914
2007 /* 1915 /* Mark the first part of the unused space, inuse for us. */
2008 * Mark the first part of the unused space, inuse for us.
2009 */
2010 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 1916 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
2011 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 1917 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
2012 &needlog, &needscan); 1918 &needlog, &needscan);
@@ -2014,9 +1920,8 @@ xfs_dir2_node_addname_int(
2014 xfs_trans_brelse(tp, dbp); 1920 xfs_trans_brelse(tp, dbp);
2015 return error; 1921 return error;
2016 } 1922 }
2017 /* 1923
2018 * Fill in the new entry and log it. 1924 /* Fill in the new entry and log it. */
2019 */
2020 dep = (xfs_dir2_data_entry_t *)dup; 1925 dep = (xfs_dir2_data_entry_t *)dup;
2021 dep->inumber = cpu_to_be64(args->inumber); 1926 dep->inumber = cpu_to_be64(args->inumber);
2022 dep->namelen = args->namelen; 1927 dep->namelen = args->namelen;
@@ -2025,38 +1930,101 @@ xfs_dir2_node_addname_int(
2025 tagp = dp->d_ops->data_entry_tag_p(dep); 1930 tagp = dp->d_ops->data_entry_tag_p(dep);
2026 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1931 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
2027 xfs_dir2_data_log_entry(args, dbp, dep); 1932 xfs_dir2_data_log_entry(args, dbp, dep);
2028 /* 1933
2029 * Rescan the block for bestfree if needed. 1934 /* Rescan the freespace and log the data block if needed. */
2030 */
2031 if (needscan) 1935 if (needscan)
2032 xfs_dir2_data_freescan(dp, hdr, &needlog); 1936 xfs_dir2_data_freescan(dp, hdr, &needlog);
2033 /*
2034 * Log the data block header if needed.
2035 */
2036 if (needlog) 1937 if (needlog)
2037 xfs_dir2_data_log_header(args, dbp); 1938 xfs_dir2_data_log_header(args, dbp);
2038 /* 1939
2039 * If the freespace entry is now wrong, update it. 1940 /* If the freespace block entry is now wrong, update it. */
2040 */ 1941 free = fbp->b_addr;
2041 bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ 1942 bests = dp->d_ops->free_bests_p(free);
2042 if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { 1943 if (bests[findex] != bf[0].length) {
2043 bests[findex] = bf[0].length; 1944 bests[findex] = bf[0].length;
2044 logfree = 1; 1945 logfree = 1;
2045 } 1946 }
2046 /* 1947
2047 * Log the freespace entry if needed. 1948 /* Log the freespace entry if needed. */
2048 */
2049 if (logfree) 1949 if (logfree)
2050 xfs_dir2_free_log_bests(args, fbp, findex, findex); 1950 xfs_dir2_free_log_bests(args, fbp, findex, findex);
2051 /* 1951
2052 * Return the data block and offset in args, then drop the data block. 1952 /* Return the data block and offset in args. */
2053 */
2054 args->blkno = (xfs_dablk_t)dbno; 1953 args->blkno = (xfs_dablk_t)dbno;
2055 args->index = be16_to_cpu(*tagp); 1954 args->index = be16_to_cpu(*tagp);
2056 return 0; 1955 return 0;
2057} 1956}
2058 1957
2059/* 1958/*
1959 * Top-level node form directory addname routine.
1960 */
1961int /* error */
1962xfs_dir2_node_addname(
1963 xfs_da_args_t *args) /* operation arguments */
1964{
1965 xfs_da_state_blk_t *blk; /* leaf block for insert */
1966 int error; /* error return value */
1967 int rval; /* sub-return value */
1968 xfs_da_state_t *state; /* btree cursor */
1969
1970 trace_xfs_dir2_node_addname(args);
1971
1972 /*
1973 * Allocate and initialize the state (btree cursor).
1974 */
1975 state = xfs_da_state_alloc();
1976 state->args = args;
1977 state->mp = args->dp->i_mount;
1978 /*
1979 * Look up the name. We're not supposed to find it, but
1980 * this gives us the insertion point.
1981 */
1982 error = xfs_da3_node_lookup_int(state, &rval);
1983 if (error)
1984 rval = error;
1985 if (rval != -ENOENT) {
1986 goto done;
1987 }
1988 /*
1989 * Add the data entry to a data block.
1990 * Extravalid is set to a freeblock found by lookup.
1991 */
1992 rval = xfs_dir2_node_addname_int(args,
1993 state->extravalid ? &state->extrablk : NULL);
1994 if (rval) {
1995 goto done;
1996 }
1997 blk = &state->path.blk[state->path.active - 1];
1998 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1999 /*
2000 * Add the new leaf entry.
2001 */
2002 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
2003 if (rval == 0) {
2004 /*
2005 * It worked, fix the hash values up the btree.
2006 */
2007 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
2008 xfs_da3_fixhashpath(state, &state->path);
2009 } else {
2010 /*
2011 * It didn't work, we need to split the leaf block.
2012 */
2013 if (args->total == 0) {
2014 ASSERT(rval == -ENOSPC);
2015 goto done;
2016 }
2017 /*
2018 * Split the leaf block and insert the new entry.
2019 */
2020 rval = xfs_da3_split(state);
2021 }
2022done:
2023 xfs_da_state_free(state);
2024 return rval;
2025}
2026
2027/*
2060 * Lookup an entry in a node-format directory. 2028 * Lookup an entry in a node-format directory.
2061 * All the real work happens in xfs_da3_node_lookup_int. 2029 * All the real work happens in xfs_da3_node_lookup_int.
2062 * The only real output is the inode number of the entry. 2030 * The only real output is the inode number of the entry.
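Taken together, the xfs_dir2_node.c changes replace one monolithic addname path with focused helpers. A sketch of the resulting call structure (error handling elided):

/*
 * xfs_dir2_node_addname()
 *   xfs_da3_node_lookup_int()          find the insertion point
 *   xfs_dir2_node_addname_int()
 *     xfs_dir2_node_find_freeblk()     pick a data block with enough room
 *     xfs_dir2_node_add_datablk()      or allocate one when dbno == -1
 *     ... fill in the entry, log the data and freespace blocks ...
 *   xfs_dir2_leafn_add()               then add the matching leaf entry
 */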
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 033589257f54..85f14fc2a8da 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -164,7 +164,7 @@ xfs_dir2_block_to_sf(
164 * can free the block and copy the formatted data into the inode literal 164 * can free the block and copy the formatted data into the inode literal
165 * area. 165 * area.
166 */ 166 */
167 dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); 167 dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
168 hdr = bp->b_addr; 168 hdr = bp->b_addr;
169 169
170 /* 170 /*
@@ -436,7 +436,7 @@ xfs_dir2_sf_addname_hard(
436 436
437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
438 old_isize = (int)dp->i_d.di_size; 438 old_isize = (int)dp->i_d.di_size;
439 buf = kmem_alloc(old_isize, KM_SLEEP); 439 buf = kmem_alloc(old_isize, 0);
440 oldsfp = (xfs_dir2_sf_hdr_t *)buf; 440 oldsfp = (xfs_dir2_sf_hdr_t *)buf;
441 memcpy(oldsfp, sfp, old_isize); 441 memcpy(oldsfp, sfp, old_isize);
442 /* 442 /*
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_toino4(
1096 * Don't want xfs_idata_realloc copying the data here. 1096 * Don't want xfs_idata_realloc copying the data here.
1097 */ 1097 */
1098 oldsize = dp->i_df.if_bytes; 1098 oldsize = dp->i_df.if_bytes;
1099 buf = kmem_alloc(oldsize, KM_SLEEP); 1099 buf = kmem_alloc(oldsize, 0);
1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1101 ASSERT(oldsfp->i8count == 1); 1101 ASSERT(oldsfp->i8count == 1);
1102 memcpy(buf, oldsfp, oldsize); 1102 memcpy(buf, oldsfp, oldsize);
@@ -1169,7 +1169,7 @@ xfs_dir2_sf_toino8(
1169 * Don't want xfs_idata_realloc copying the data here. 1169 * Don't want xfs_idata_realloc copying the data here.
1170 */ 1170 */
1171 oldsize = dp->i_df.if_bytes; 1171 oldsize = dp->i_df.if_bytes;
1172 buf = kmem_alloc(oldsize, KM_SLEEP); 1172 buf = kmem_alloc(oldsize, 0);
1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1174 ASSERT(oldsfp->i8count == 0); 1174 ASSERT(oldsfp->i8count == 0);
1175 memcpy(buf, oldsfp, oldsize); 1175 memcpy(buf, oldsfp, oldsize);
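Each of the four hunks above is the same mechanical conversion running through this series: KM_SLEEP disappears as a distinct flag because blocking allocation is now the default, so callers pass 0 unless they need a real modifier. Roughly (the KM_NOFS case appears in the xfs_inode_fork.c hunks below):

buf = kmem_alloc(size, 0);              /* may sleep; the old KM_SLEEP behaviour */
afp = kmem_zone_zalloc(zone, KM_NOFS);  /* still sleeps, but avoids recursing
                                         * into filesystem reclaim */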
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 52d03a3a02a4..39dd2b908106 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -287,7 +287,7 @@ struct xfs_ag_geometry {
287 uint32_t ag_ifree; /* o: inodes free */ 287 uint32_t ag_ifree; /* o: inodes free */
288 uint32_t ag_sick; /* o: sick things in ag */ 288 uint32_t ag_sick; /* o: sick things in ag */
289 uint32_t ag_checked; /* o: checked metadata in ag */ 289 uint32_t ag_checked; /* o: checked metadata in ag */
290 uint32_t ag_reserved32; /* o: zero */ 290 uint32_t ag_flags; /* i/o: flags for this ag */
291 uint64_t ag_reserved[12];/* o: zero */ 291 uint64_t ag_reserved[12];/* o: zero */
292}; 292};
293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ 293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 04377ab75863..588d44613094 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2787,8 +2787,13 @@ xfs_ialloc_setup_geometry(
2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, 2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
2788 inodes); 2788 inodes);
2789 2789
2790 /* Set the maximum inode count for this filesystem. */ 2790 /*
2791 if (sbp->sb_imax_pct) { 2791 * Set the maximum inode count for this filesystem, being careful not
2792 * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
2793 * users should never get here due to failing sb verification, but
2794 * certain users (xfs_db) need to be usable even with corrupt metadata.
2795 */
2796 if (sbp->sb_imax_pct && igeo->ialloc_blks) {
2792 /* 2797 /*
2793 * Make sure the maximum inode count is a multiple 2798 * Make sure the maximum inode count is a multiple
2794 * of the units we allocate inodes in. 2799 * of the units we allocate inodes in.
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 27aa3f2bc4bc..7bc87408f1a0 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -616,7 +616,7 @@ xfs_iext_realloc_root(
616 * sequence counter is seen before the modifications to the extent tree itself 616 * sequence counter is seen before the modifications to the extent tree itself
617 * take effect. 617 * take effect.
618 */ 618 */
619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) 619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
620{ 620{
621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); 621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
622} 622}
@@ -633,7 +633,7 @@ xfs_iext_insert(
633 struct xfs_iext_leaf *new = NULL; 633 struct xfs_iext_leaf *new = NULL;
634 int nr_entries, i; 634 int nr_entries, i;
635 635
636 xfs_iext_inc_seq(ifp, state); 636 xfs_iext_inc_seq(ifp);
637 637
638 if (ifp->if_height == 0) 638 if (ifp->if_height == 0)
639 xfs_iext_alloc_root(ifp, cur); 639 xfs_iext_alloc_root(ifp, cur);
@@ -875,7 +875,7 @@ xfs_iext_remove(
875 ASSERT(ifp->if_u1.if_root != NULL); 875 ASSERT(ifp->if_u1.if_root != NULL);
876 ASSERT(xfs_iext_valid(ifp, cur)); 876 ASSERT(xfs_iext_valid(ifp, cur));
877 877
878 xfs_iext_inc_seq(ifp, state); 878 xfs_iext_inc_seq(ifp);
879 879
880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; 880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
881 for (i = cur->pos; i < nr_entries; i++) 881 for (i = cur->pos; i < nr_entries; i++)
@@ -983,7 +983,7 @@ xfs_iext_update_extent(
983{ 983{
984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); 984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
985 985
986 xfs_iext_inc_seq(ifp, state); 986 xfs_iext_inc_seq(ifp);
987 987
988 if (cur->pos == 0) { 988 if (cur->pos == 0) {
989 struct xfs_bmbt_irec old; 989 struct xfs_bmbt_irec old;
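With the unused state argument dropped, xfs_iext_inc_seq() is nothing more than a lockless bump of the fork's sequence counter before each tree modification. A hypothetical lockless reader pairing with it might look roughly like this (a sketch only; the real consumers sample if_seq to revalidate cached mappings):

unsigned int seq = READ_ONCE(ifp->if_seq);      /* sample before the walk */

/* ... read extents without holding the lock across the whole walk ... */

if (READ_ONCE(ifp->if_seq) != seq) {
        /* the tree changed underneath us: discard and revalidate */
}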
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bf3e04018246..c643beeb5a24 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -94,7 +94,7 @@ xfs_iformat_fork(
94 return 0; 94 return 0;
95 95
96 ASSERT(ip->i_afp == NULL); 96 ASSERT(ip->i_afp == NULL);
97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
98 98
99 switch (dip->di_aformat) { 99 switch (dip->di_aformat) {
100 case XFS_DINODE_FMT_LOCAL: 100 case XFS_DINODE_FMT_LOCAL:
@@ -147,7 +147,7 @@ xfs_init_local_fork(
147 147
148 if (size) { 148 if (size) {
149 real_size = roundup(mem_size, 4); 149 real_size = roundup(mem_size, 4);
150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); 150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
151 memcpy(ifp->if_u1.if_data, data, size); 151 memcpy(ifp->if_u1.if_data, data, size);
152 if (zero_terminate) 152 if (zero_terminate)
153 ifp->if_u1.if_data[size] = '\0'; 153 ifp->if_u1.if_data[size] = '\0';
@@ -302,7 +302,7 @@ xfs_iformat_btree(
302 } 302 }
303 303
304 ifp->if_broot_bytes = size; 304 ifp->if_broot_bytes = size;
305 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); 305 ifp->if_broot = kmem_alloc(size, KM_NOFS);
306 ASSERT(ifp->if_broot != NULL); 306 ASSERT(ifp->if_broot != NULL);
307 /* 307 /*
308 * Copy and convert from the on-disk structure 308 * Copy and convert from the on-disk structure
@@ -367,7 +367,7 @@ xfs_iroot_realloc(
367 */ 367 */
368 if (ifp->if_broot_bytes == 0) { 368 if (ifp->if_broot_bytes == 0) {
369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); 369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
370 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 370 ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
371 ifp->if_broot_bytes = (int)new_size; 371 ifp->if_broot_bytes = (int)new_size;
372 return; 372 return;
373 } 373 }
@@ -382,7 +382,7 @@ xfs_iroot_realloc(
382 new_max = cur_max + rec_diff; 382 new_max = cur_max + rec_diff;
383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); 383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
385 KM_SLEEP | KM_NOFS); 385 KM_NOFS);
386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
387 ifp->if_broot_bytes); 387 ifp->if_broot_bytes);
388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -408,7 +408,7 @@ xfs_iroot_realloc(
408 else 408 else
409 new_size = 0; 409 new_size = 0;
410 if (new_size > 0) { 410 if (new_size > 0) {
411 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 411 new_broot = kmem_alloc(new_size, KM_NOFS);
412 /* 412 /*
413 * First copy over the btree block header. 413 * First copy over the btree block header.
414 */ 414 */
@@ -492,7 +492,7 @@ xfs_idata_realloc(
492 * We enforce that here. 492 * We enforce that here.
493 */ 493 */
494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, 494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
495 roundup(new_size, 4), KM_SLEEP | KM_NOFS); 495 roundup(new_size, 4), KM_NOFS);
496 ifp->if_bytes = new_size; 496 ifp->if_bytes = new_size;
497} 497}
498 498
@@ -683,7 +683,7 @@ xfs_ifork_init_cow(
683 return; 683 return;
684 684
685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, 685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
686 KM_SLEEP | KM_NOFS); 686 KM_NOFS);
687 ip->i_cowfp->if_flags = XFS_IFEXTENTS; 687 ip->i_cowfp->if_flags = XFS_IFEXTENTS;
688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS; 688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
689 ip->i_cnextents = 0; 689 ip->i_cnextents = 0;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 51bb9bdb0e84..9a7fadb1361c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1174,7 +1174,7 @@ out_cur:
1174/* 1174/*
1175 * Record a refcount intent for later processing. 1175 * Record a refcount intent for later processing.
1176 */ 1176 */
1177static int 1177static void
1178__xfs_refcount_add( 1178__xfs_refcount_add(
1179 struct xfs_trans *tp, 1179 struct xfs_trans *tp,
1180 enum xfs_refcount_intent_type type, 1180 enum xfs_refcount_intent_type type,
@@ -1189,44 +1189,43 @@ __xfs_refcount_add(
1189 blockcount); 1189 blockcount);
1190 1190
1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent), 1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
1192 KM_SLEEP | KM_NOFS); 1192 KM_NOFS);
1193 INIT_LIST_HEAD(&ri->ri_list); 1193 INIT_LIST_HEAD(&ri->ri_list);
1194 ri->ri_type = type; 1194 ri->ri_type = type;
1195 ri->ri_startblock = startblock; 1195 ri->ri_startblock = startblock;
1196 ri->ri_blockcount = blockcount; 1196 ri->ri_blockcount = blockcount;
1197 1197
1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); 1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
1199 return 0;
1200} 1199}
1201 1200
1202/* 1201/*
1203 * Increase the reference count of the blocks backing a file's extent. 1202 * Increase the reference count of the blocks backing a file's extent.
1204 */ 1203 */
1205int 1204void
1206xfs_refcount_increase_extent( 1205xfs_refcount_increase_extent(
1207 struct xfs_trans *tp, 1206 struct xfs_trans *tp,
1208 struct xfs_bmbt_irec *PREV) 1207 struct xfs_bmbt_irec *PREV)
1209{ 1208{
1210 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1209 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1211 return 0; 1210 return;
1212 1211
1213 return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, 1212 __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
1214 PREV->br_startblock, PREV->br_blockcount); 1213 PREV->br_blockcount);
1215} 1214}
1216 1215
1217/* 1216/*
1218 * Decrease the reference count of the blocks backing a file's extent. 1217 * Decrease the reference count of the blocks backing a file's extent.
1219 */ 1218 */
1220int 1219void
1221xfs_refcount_decrease_extent( 1220xfs_refcount_decrease_extent(
1222 struct xfs_trans *tp, 1221 struct xfs_trans *tp,
1223 struct xfs_bmbt_irec *PREV) 1222 struct xfs_bmbt_irec *PREV)
1224{ 1223{
1225 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1224 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1226 return 0; 1225 return;
1227 1226
1228 return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, 1227 __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
1229 PREV->br_startblock, PREV->br_blockcount); 1228 PREV->br_blockcount);
1230} 1229}
1231 1230
1232/* 1231/*
@@ -1541,47 +1540,40 @@ __xfs_refcount_cow_free(
1541} 1540}
1542 1541
1543/* Record a CoW staging extent in the refcount btree. */ 1542/* Record a CoW staging extent in the refcount btree. */
1544int 1543void
1545xfs_refcount_alloc_cow_extent( 1544xfs_refcount_alloc_cow_extent(
1546 struct xfs_trans *tp, 1545 struct xfs_trans *tp,
1547 xfs_fsblock_t fsb, 1546 xfs_fsblock_t fsb,
1548 xfs_extlen_t len) 1547 xfs_extlen_t len)
1549{ 1548{
1550 struct xfs_mount *mp = tp->t_mountp; 1549 struct xfs_mount *mp = tp->t_mountp;
1551 int error;
1552 1550
1553 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1551 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1554 return 0; 1552 return;
1555 1553
1556 error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1554 __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
1557 if (error)
1558 return error;
1559 1555
1560 /* Add rmap entry */ 1556 /* Add rmap entry */
1561 return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1557 xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1562 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1558 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1563} 1559}
1564 1560
1565/* Forget a CoW staging event in the refcount btree. */ 1561/* Forget a CoW staging event in the refcount btree. */
1566int 1562void
1567xfs_refcount_free_cow_extent( 1563xfs_refcount_free_cow_extent(
1568 struct xfs_trans *tp, 1564 struct xfs_trans *tp,
1569 xfs_fsblock_t fsb, 1565 xfs_fsblock_t fsb,
1570 xfs_extlen_t len) 1566 xfs_extlen_t len)
1571{ 1567{
1572 struct xfs_mount *mp = tp->t_mountp; 1568 struct xfs_mount *mp = tp->t_mountp;
1573 int error;
1574 1569
1575 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1570 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1576 return 0; 1571 return;
1577 1572
1578 /* Remove rmap entry */ 1573 /* Remove rmap entry */
1579 error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1574 xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1580 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1575 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1581 if (error) 1576 __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1582 return error;
1583
1584 return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1585} 1577}
1586 1578
1587struct xfs_refcount_recovery { 1579struct xfs_refcount_recovery {
@@ -1602,7 +1594,7 @@ xfs_refcount_recover_extent(
1602 if (be32_to_cpu(rec->refc.rc_refcount) != 1) 1594 if (be32_to_cpu(rec->refc.rc_refcount) != 1)
1603 return -EFSCORRUPTED; 1595 return -EFSCORRUPTED;
1604 1596
1605 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); 1597 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
1606 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); 1598 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
1607 list_add_tail(&rr->rr_list, debris); 1599 list_add_tail(&rr->rr_list, debris);
1608 1600
@@ -1679,10 +1671,8 @@ xfs_refcount_recover_cow_leftovers(
1679 /* Free the orphan record */ 1671 /* Free the orphan record */
1680 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; 1672 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
1681 fsb = XFS_AGB_TO_FSB(mp, agno, agbno); 1673 fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
1682 error = xfs_refcount_free_cow_extent(tp, fsb, 1674 xfs_refcount_free_cow_extent(tp, fsb,
1683 rr->rr_rrec.rc_blockcount); 1675 rr->rr_rrec.rc_blockcount);
1684 if (error)
1685 goto out_trans;
1686 1676
1687 /* Free the block. */ 1677 /* Free the block. */
1688 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); 1678 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518575e7..209795539c8d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -29,9 +29,9 @@ struct xfs_refcount_intent {
29 xfs_extlen_t ri_blockcount; 29 xfs_extlen_t ri_blockcount;
30}; 30};
31 31
32extern int xfs_refcount_increase_extent(struct xfs_trans *tp, 32void xfs_refcount_increase_extent(struct xfs_trans *tp,
33 struct xfs_bmbt_irec *irec); 33 struct xfs_bmbt_irec *irec);
34extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, 34void xfs_refcount_decrease_extent(struct xfs_trans *tp,
35 struct xfs_bmbt_irec *irec); 35 struct xfs_bmbt_irec *irec);
36 36
37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, 37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, 45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
46 xfs_extlen_t *flen, bool find_end_of_shared); 46 xfs_extlen_t *flen, bool find_end_of_shared);
47 47
48extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, 48void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
49 xfs_fsblock_t fsb, xfs_extlen_t len); 49 xfs_extlen_t len);
50extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, 50void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
51 xfs_fsblock_t fsb, xfs_extlen_t len); 51 xfs_extlen_t len);
52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, 52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
53 xfs_agnumber_t agno); 53 xfs_agnumber_t agno);
54 54
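The int-to-void conversion is possible because __xfs_refcount_add() now just performs a blocking allocation and queues the intent, leaving no failure case to propagate. For callers the change looks like this (a sketch):

/* before: the return value had to be checked */
error = xfs_refcount_increase_extent(tp, &irec);
if (error)
        goto out_cancel;

/* after: queueing a refcount intent cannot fail */
xfs_refcount_increase_extent(tp, &irec);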
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index e6aeb390b2fb..38e9414878b3 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -168,7 +168,6 @@ xfs_rmap_btrec_to_irec(
168 union xfs_btree_rec *rec, 168 union xfs_btree_rec *rec,
169 struct xfs_rmap_irec *irec) 169 struct xfs_rmap_irec *irec)
170{ 170{
171 irec->rm_flags = 0;
172 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); 171 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
173 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); 172 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
174 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); 173 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -254,15 +253,15 @@ xfs_rmap_find_left_neighbor_helper(
254 rec->rm_flags); 253 rec->rm_flags);
255 254
256 if (rec->rm_owner != info->high.rm_owner) 255 if (rec->rm_owner != info->high.rm_owner)
257 return XFS_BTREE_QUERY_RANGE_CONTINUE; 256 return 0;
258 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 257 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
259 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 258 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
260 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) 259 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
261 return XFS_BTREE_QUERY_RANGE_CONTINUE; 260 return 0;
262 261
263 *info->irec = *rec; 262 *info->irec = *rec;
264 *info->stat = 1; 263 *info->stat = 1;
265 return XFS_BTREE_QUERY_RANGE_ABORT; 264 return -ECANCELED;
266} 265}
267 266
268/* 267/*
@@ -305,7 +304,7 @@ xfs_rmap_find_left_neighbor(
305 304
306 error = xfs_rmap_query_range(cur, &info.high, &info.high, 305 error = xfs_rmap_query_range(cur, &info.high, &info.high,
307 xfs_rmap_find_left_neighbor_helper, &info); 306 xfs_rmap_find_left_neighbor_helper, &info);
308 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 307 if (error == -ECANCELED)
309 error = 0; 308 error = 0;
310 if (*stat) 309 if (*stat)
311 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, 310 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -330,16 +329,16 @@ xfs_rmap_lookup_le_range_helper(
330 rec->rm_flags); 329 rec->rm_flags);
331 330
332 if (rec->rm_owner != info->high.rm_owner) 331 if (rec->rm_owner != info->high.rm_owner)
333 return XFS_BTREE_QUERY_RANGE_CONTINUE; 332 return 0;
334 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 333 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
335 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 334 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
336 (rec->rm_offset > info->high.rm_offset || 335 (rec->rm_offset > info->high.rm_offset ||
337 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) 336 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
338 return XFS_BTREE_QUERY_RANGE_CONTINUE; 337 return 0;
339 338
340 *info->irec = *rec; 339 *info->irec = *rec;
341 *info->stat = 1; 340 *info->stat = 1;
342 return XFS_BTREE_QUERY_RANGE_ABORT; 341 return -ECANCELED;
343} 342}
344 343
345/* 344/*
@@ -377,7 +376,7 @@ xfs_rmap_lookup_le_range(
377 cur->bc_private.a.agno, bno, 0, owner, offset, flags); 376 cur->bc_private.a.agno, bno, 0, owner, offset, flags);
378 error = xfs_rmap_query_range(cur, &info.high, &info.high, 377 error = xfs_rmap_query_range(cur, &info.high, &info.high,
379 xfs_rmap_lookup_le_range_helper, &info); 378 xfs_rmap_lookup_le_range_helper, &info);
380 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 379 if (error == -ECANCELED)
381 error = 0; 380 error = 0;
382 if (*stat) 381 if (*stat)
383 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, 382 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -2268,7 +2267,7 @@ xfs_rmap_update_is_needed(
2268 * Record a rmap intent; the list is kept sorted first by AG and then by 2267 * Record a rmap intent; the list is kept sorted first by AG and then by
2269 * increasing age. 2268 * increasing age.
2270 */ 2269 */
2271static int 2270static void
2272__xfs_rmap_add( 2271__xfs_rmap_add(
2273 struct xfs_trans *tp, 2272 struct xfs_trans *tp,
2274 enum xfs_rmap_intent_type type, 2273 enum xfs_rmap_intent_type type,
@@ -2287,7 +2286,7 @@ __xfs_rmap_add(
2287 bmap->br_blockcount, 2286 bmap->br_blockcount,
2288 bmap->br_state); 2287 bmap->br_state);
2289 2288
2290 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); 2289 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
2291 INIT_LIST_HEAD(&ri->ri_list); 2290 INIT_LIST_HEAD(&ri->ri_list);
2292 ri->ri_type = type; 2291 ri->ri_type = type;
2293 ri->ri_owner = owner; 2292 ri->ri_owner = owner;
@@ -2295,11 +2294,10 @@ __xfs_rmap_add(
2295 ri->ri_bmap = *bmap; 2294 ri->ri_bmap = *bmap;
2296 2295
2297 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); 2296 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
2298 return 0;
2299} 2297}
2300 2298
2301/* Map an extent into a file. */ 2299/* Map an extent into a file. */
2302int 2300void
2303xfs_rmap_map_extent( 2301xfs_rmap_map_extent(
2304 struct xfs_trans *tp, 2302 struct xfs_trans *tp,
2305 struct xfs_inode *ip, 2303 struct xfs_inode *ip,
@@ -2307,15 +2305,15 @@ xfs_rmap_map_extent(
2307 struct xfs_bmbt_irec *PREV) 2305 struct xfs_bmbt_irec *PREV)
2308{ 2306{
2309 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2307 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2310 return 0; 2308 return;
2311 2309
2312 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2310 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2313 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, 2311 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
2314 whichfork, PREV); 2312 whichfork, PREV);
2315} 2313}
2316 2314
2317/* Unmap an extent out of a file. */ 2315/* Unmap an extent out of a file. */
2318int 2316void
2319xfs_rmap_unmap_extent( 2317xfs_rmap_unmap_extent(
2320 struct xfs_trans *tp, 2318 struct xfs_trans *tp,
2321 struct xfs_inode *ip, 2319 struct xfs_inode *ip,
@@ -2323,9 +2321,9 @@ xfs_rmap_unmap_extent(
2323 struct xfs_bmbt_irec *PREV) 2321 struct xfs_bmbt_irec *PREV)
2324{ 2322{
2325 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2323 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2326 return 0; 2324 return;
2327 2325
2328 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2326 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2329 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, 2327 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
2330 whichfork, PREV); 2328 whichfork, PREV);
2331} 2329}
@@ -2336,7 +2334,7 @@ xfs_rmap_unmap_extent(
2336 * Note that tp can be NULL here as no transaction is used for COW fork 2334 * Note that tp can be NULL here as no transaction is used for COW fork
2337 * unwritten conversion. 2335 * unwritten conversion.
2338 */ 2336 */
2339int 2337void
2340xfs_rmap_convert_extent( 2338xfs_rmap_convert_extent(
2341 struct xfs_mount *mp, 2339 struct xfs_mount *mp,
2342 struct xfs_trans *tp, 2340 struct xfs_trans *tp,
@@ -2345,15 +2343,15 @@ xfs_rmap_convert_extent(
2345 struct xfs_bmbt_irec *PREV) 2343 struct xfs_bmbt_irec *PREV)
2346{ 2344{
2347 if (!xfs_rmap_update_is_needed(mp, whichfork)) 2345 if (!xfs_rmap_update_is_needed(mp, whichfork))
2348 return 0; 2346 return;
2349 2347
2350 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2348 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2351 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, 2349 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
2352 whichfork, PREV); 2350 whichfork, PREV);
2353} 2351}
2354 2352
2355/* Schedule the creation of an rmap for non-file data. */ 2353/* Schedule the creation of an rmap for non-file data. */
2356int 2354void
2357xfs_rmap_alloc_extent( 2355xfs_rmap_alloc_extent(
2358 struct xfs_trans *tp, 2356 struct xfs_trans *tp,
2359 xfs_agnumber_t agno, 2357 xfs_agnumber_t agno,
@@ -2364,18 +2362,18 @@ xfs_rmap_alloc_extent(
2364 struct xfs_bmbt_irec bmap; 2362 struct xfs_bmbt_irec bmap;
2365 2363
2366 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2364 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2367 return 0; 2365 return;
2368 2366
2369 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2367 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2370 bmap.br_blockcount = len; 2368 bmap.br_blockcount = len;
2371 bmap.br_startoff = 0; 2369 bmap.br_startoff = 0;
2372 bmap.br_state = XFS_EXT_NORM; 2370 bmap.br_state = XFS_EXT_NORM;
2373 2371
2374 return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2372 __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
2375} 2373}
2376 2374
2377/* Schedule the deletion of an rmap for non-file data. */ 2375/* Schedule the deletion of an rmap for non-file data. */
2378int 2376void
2379xfs_rmap_free_extent( 2377xfs_rmap_free_extent(
2380 struct xfs_trans *tp, 2378 struct xfs_trans *tp,
2381 xfs_agnumber_t agno, 2379 xfs_agnumber_t agno,
@@ -2386,14 +2384,14 @@ xfs_rmap_free_extent(
2386 struct xfs_bmbt_irec bmap; 2384 struct xfs_bmbt_irec bmap;
2387 2385
2388 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2386 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2389 return 0; 2387 return;
2390 2388
2391 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2389 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2392 bmap.br_blockcount = len; 2390 bmap.br_blockcount = len;
2393 bmap.br_startoff = 0; 2391 bmap.br_startoff = 0;
2394 bmap.br_state = XFS_EXT_NORM; 2392 bmap.br_state = XFS_EXT_NORM;
2395 2393
2396 return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2394 __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
2397} 2395}
2398 2396
2399/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ 2397/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2511,7 +2509,7 @@ xfs_rmap_has_other_keys_helper(
2511 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) 2509 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
2512 return 0; 2510 return 0;
2513 rks->has_rmap = true; 2511 rks->has_rmap = true;
2514 return XFS_BTREE_QUERY_RANGE_ABORT; 2512 return -ECANCELED;
2515} 2513}
2516 2514
2517/* 2515/*
@@ -2540,8 +2538,11 @@ xfs_rmap_has_other_keys(
2540 2538
2541 error = xfs_rmap_query_range(cur, &low, &high, 2539 error = xfs_rmap_query_range(cur, &low, &high,
2542 xfs_rmap_has_other_keys_helper, &rks); 2540 xfs_rmap_has_other_keys_helper, &rks);
2541 if (error < 0)
2542 return error;
2543
2543 *has_rmap = rks.has_rmap; 2544 *has_rmap = rks.has_rmap;
2544 return error; 2545 return 0;
2545} 2546}
2546 2547
2547const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { 2548const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
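The XFS_BTREE_QUERY_RANGE_CONTINUE/ABORT magic values give way to the generic kernel convention: an iterator helper returns 0 to keep going and a negative errno, -ECANCELED by convention, to stop early, which the caller then squashes back to success. A minimal helper written against the new convention might look like this (is_interesting() is a hypothetical predicate):

STATIC int
sketch_rmap_helper(
        struct xfs_btree_cur    *cur,
        struct xfs_rmap_irec    *rec,
        void                    *priv)
{
        if (!is_interesting(rec))
                return 0;               /* keep iterating */
        *(struct xfs_rmap_irec *)priv = *rec;
        return -ECANCELED;              /* stop the range query early */
}

error = xfs_rmap_query_range(cur, &low, &high, sketch_rmap_helper, &irec);
if (error == -ECANCELED)
        error = 0;                      /* an early exit is not an error */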
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index e21ed0294e5c..abe633403fd1 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack(
68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) 68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
69 return -EFSCORRUPTED; 69 return -EFSCORRUPTED;
70 irec->rm_offset = XFS_RMAP_OFF(offset); 70 irec->rm_offset = XFS_RMAP_OFF(offset);
71 irec->rm_flags = 0;
71 if (offset & XFS_RMAP_OFF_ATTR_FORK) 72 if (offset & XFS_RMAP_OFF_ATTR_FORK)
72 irec->rm_flags |= XFS_RMAP_ATTR_FORK; 73 irec->rm_flags |= XFS_RMAP_ATTR_FORK;
73 if (offset & XFS_RMAP_OFF_BMBT_BLOCK) 74 if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -161,16 +162,16 @@ struct xfs_rmap_intent {
161}; 162};
162 163
163/* functions for updating the rmapbt based on bmbt map/unmap operations */ 164/* functions for updating the rmapbt based on bmbt map/unmap operations */
164int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 165void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
165 int whichfork, struct xfs_bmbt_irec *imap); 166 int whichfork, struct xfs_bmbt_irec *imap);
166int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 167void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
167 int whichfork, struct xfs_bmbt_irec *imap); 168 int whichfork, struct xfs_bmbt_irec *imap);
168int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 169void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
169 struct xfs_inode *ip, int whichfork, 170 struct xfs_inode *ip, int whichfork,
170 struct xfs_bmbt_irec *imap); 171 struct xfs_bmbt_irec *imap);
171int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 172void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
172 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 173 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
173int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 174void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
174 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 175 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
175 176
176void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, 177void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index e0641b7337b3..c45acbd3add9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -177,10 +177,4 @@ struct xfs_ino_geometry {
177 unsigned int agino_log; /* #bits for agino in inum */ 177 unsigned int agino_log; /* #bits for agino in inum */
178}; 178};
179 179
180/* Keep iterating the data structure. */
181#define XFS_ITER_CONTINUE (0)
182
183/* Stop iterating the data structure. */
184#define XFS_ITER_ABORT (1)
185
186#endif /* __XFS_SHARED_H__ */ 180#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 802b34cd10fe..300b3e91ca3a 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -169,6 +169,14 @@ typedef struct xfs_bmbt_irec
169 xfs_exntst_t br_state; /* extent state */ 169 xfs_exntst_t br_state; /* extent state */
170} xfs_bmbt_irec_t; 170} xfs_bmbt_irec_t;
171 171
172/* per-AG block reservation types */
173enum xfs_ag_resv_type {
174 XFS_AG_RESV_NONE = 0,
175 XFS_AG_RESV_AGFL,
176 XFS_AG_RESV_METADATA,
177 XFS_AG_RESV_RMAPBT,
178};
179
172/* 180/*
173 * Type verifier functions 181 * Type verifier functions
174 */ 182 */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 16b09b941441..ba0f747c82e8 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -639,7 +639,7 @@ xchk_agfl_block(
639 xchk_agfl_block_xref(sc, agbno); 639 xchk_agfl_block_xref(sc, agbno);
640 640
641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
642 return XFS_ITER_ABORT; 642 return -ECANCELED;
643 643
644 return 0; 644 return 0;
645} 645}
@@ -730,7 +730,7 @@ xchk_agfl(
730 /* Check the blocks in the AGFL. */ 730 /* Check the blocks in the AGFL. */
731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), 731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
732 sc->sa.agfl_bp, xchk_agfl_block, &sai); 732 sc->sa.agfl_bp, xchk_agfl_block, &sai);
733 if (error == XFS_ITER_ABORT) { 733 if (error == -ECANCELED) {
734 error = 0; 734 error = 0;
735 goto out_free; 735 goto out_free;
736 } 736 }
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 1afc58bf71dd..0edc7f8eb96e 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -80,7 +80,7 @@ xchk_setup_xattr(
80 * without the inode lock held, which means we can sleep. 80 * without the inode lock held, which means we can sleep.
81 */ 81 */
82 if (sc->flags & XCHK_TRY_HARDER) { 82 if (sc->flags & XCHK_TRY_HARDER) {
83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); 83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
84 if (error) 84 if (error)
85 return error; 85 return error;
86 } 86 }
@@ -163,8 +163,6 @@ xchk_xattr_listent(
163 args.valuelen = valuelen; 163 args.valuelen = valuelen;
164 164
165 error = xfs_attr_get_ilocked(context->dp, &args); 165 error = xfs_attr_get_ilocked(context->dp, &args);
166 if (error == -EEXIST)
167 error = 0;
168 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, 166 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
169 &error)) 167 &error))
170 goto fail_xref; 168 goto fail_xref;
@@ -173,7 +171,7 @@ xchk_xattr_listent(
173 args.blkno); 171 args.blkno);
174fail_xref: 172fail_xref:
175 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 173 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
176 context->seen_enough = XFS_ITER_ABORT; 174 context->seen_enough = 1;
177 return; 175 return;
178} 176}
179 177
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 1bd29fdc2ab5..fa6ea6407992 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -75,6 +75,7 @@ struct xchk_bmap_info {
75 xfs_fileoff_t lastoff; 75 xfs_fileoff_t lastoff;
76 bool is_rt; 76 bool is_rt;
77 bool is_shared; 77 bool is_shared;
78 bool was_loaded;
78 int whichfork; 79 int whichfork;
79}; 80};
80 81
@@ -213,25 +214,20 @@ xchk_bmap_xref_rmap(
213 214
214/* Cross-reference a single rtdev extent record. */ 215/* Cross-reference a single rtdev extent record. */
215STATIC void 216STATIC void
216xchk_bmap_rt_extent_xref( 217xchk_bmap_rt_iextent_xref(
217 struct xchk_bmap_info *info,
218 struct xfs_inode *ip, 218 struct xfs_inode *ip,
219 struct xfs_btree_cur *cur, 219 struct xchk_bmap_info *info,
220 struct xfs_bmbt_irec *irec) 220 struct xfs_bmbt_irec *irec)
221{ 221{
222 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
223 return;
224
225 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, 222 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
226 irec->br_blockcount); 223 irec->br_blockcount);
227} 224}
228 225
229/* Cross-reference a single datadev extent record. */ 226/* Cross-reference a single datadev extent record. */
230STATIC void 227STATIC void
231xchk_bmap_extent_xref( 228xchk_bmap_iextent_xref(
232 struct xchk_bmap_info *info,
233 struct xfs_inode *ip, 229 struct xfs_inode *ip,
234 struct xfs_btree_cur *cur, 230 struct xchk_bmap_info *info,
235 struct xfs_bmbt_irec *irec) 231 struct xfs_bmbt_irec *irec)
236{ 232{
237 struct xfs_mount *mp = info->sc->mp; 233 struct xfs_mount *mp = info->sc->mp;
@@ -240,9 +236,6 @@ xchk_bmap_extent_xref(
240 xfs_extlen_t len; 236 xfs_extlen_t len;
241 int error; 237 int error;
242 238
243 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
244 return;
245
246 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); 239 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
247 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); 240 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
248 len = irec->br_blockcount; 241 len = irec->br_blockcount;
@@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent(
300 293
301/* Scrub a single extent record. */ 294/* Scrub a single extent record. */
302STATIC int 295STATIC int
303xchk_bmap_extent( 296xchk_bmap_iextent(
304 struct xfs_inode *ip, 297 struct xfs_inode *ip,
305 struct xfs_btree_cur *cur,
306 struct xchk_bmap_info *info, 298 struct xchk_bmap_info *info,
307 struct xfs_bmbt_irec *irec) 299 struct xfs_bmbt_irec *irec)
308{ 300{
309 struct xfs_mount *mp = info->sc->mp; 301 struct xfs_mount *mp = info->sc->mp;
310 struct xfs_buf *bp = NULL;
311 xfs_filblks_t end; 302 xfs_filblks_t end;
312 int error = 0; 303 int error = 0;
313 304
314 if (cur)
315 xfs_btree_get_block(cur, 0, &bp);
316
317 /* 305 /*
318 * Check for out-of-order extents. This record could have come 306 * Check for out-of-order extents. This record could have come
319 * from the incore list, for which there is no ordering check. 307 * from the incore list, for which there is no ordering check.
@@ -364,10 +352,13 @@ xchk_bmap_extent(
364 xchk_fblock_set_corrupt(info->sc, info->whichfork, 352 xchk_fblock_set_corrupt(info->sc, info->whichfork,
365 irec->br_startoff); 353 irec->br_startoff);
366 354
355 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
356 return 0;
357
367 if (info->is_rt) 358 if (info->is_rt)
368 xchk_bmap_rt_extent_xref(info, ip, cur, irec); 359 xchk_bmap_rt_iextent_xref(ip, info, irec);
369 else 360 else
370 xchk_bmap_extent_xref(info, ip, cur, irec); 361 xchk_bmap_iextent_xref(ip, info, irec);
371 362
372 info->lastoff = irec->br_startoff + irec->br_blockcount; 363 info->lastoff = irec->br_startoff + irec->br_blockcount;
373 return error; 364 return error;
@@ -380,10 +371,13 @@ xchk_bmapbt_rec(
380 union xfs_btree_rec *rec) 371 union xfs_btree_rec *rec)
381{ 372{
382 struct xfs_bmbt_irec irec; 373 struct xfs_bmbt_irec irec;
374 struct xfs_bmbt_irec iext_irec;
375 struct xfs_iext_cursor icur;
383 struct xchk_bmap_info *info = bs->private; 376 struct xchk_bmap_info *info = bs->private;
384 struct xfs_inode *ip = bs->cur->bc_private.b.ip; 377 struct xfs_inode *ip = bs->cur->bc_private.b.ip;
385 struct xfs_buf *bp = NULL; 378 struct xfs_buf *bp = NULL;
386 struct xfs_btree_block *block; 379 struct xfs_btree_block *block;
380 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
387 uint64_t owner; 381 uint64_t owner;
388 int i; 382 int i;
389 383
@@ -402,9 +396,26 @@ xchk_bmapbt_rec(
402 } 396 }
403 } 397 }
404 398
405 /* Set up the in-core record and scrub it. */ 399 /*
400 * Check that the incore extent tree contains an extent that matches
401 * this one exactly. We validate those cached bmaps later, so we don't
402 * need to check them here. If the incore extent tree was just loaded
403 * from disk by the scrubber, we assume that its contents match what's
404 * on disk (we still hold the ILOCK) and skip the equivalence check.
405 */
406 if (!info->was_loaded)
407 return 0;
408
406 xfs_bmbt_disk_get_all(&rec->bmbt, &irec); 409 xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
407 return xchk_bmap_extent(ip, bs->cur, info, &irec); 410 if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
411 &iext_irec) ||
412 irec.br_startoff != iext_irec.br_startoff ||
413 irec.br_startblock != iext_irec.br_startblock ||
414 irec.br_blockcount != iext_irec.br_blockcount ||
415 irec.br_state != iext_irec.br_state)
416 xchk_fblock_set_corrupt(bs->sc, info->whichfork,
417 irec.br_startoff);
418 return 0;
408} 419}
409 420
410/* Scan the btree records. */ 421/* Scan the btree records. */
@@ -415,15 +426,26 @@ xchk_bmap_btree(
415 struct xchk_bmap_info *info) 426 struct xchk_bmap_info *info)
416{ 427{
417 struct xfs_owner_info oinfo; 428 struct xfs_owner_info oinfo;
429 struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
418 struct xfs_mount *mp = sc->mp; 430 struct xfs_mount *mp = sc->mp;
419 struct xfs_inode *ip = sc->ip; 431 struct xfs_inode *ip = sc->ip;
420 struct xfs_btree_cur *cur; 432 struct xfs_btree_cur *cur;
421 int error; 433 int error;
422 434
435 /* Load the incore bmap cache if it's not loaded. */
436 info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
437 if (!info->was_loaded) {
438 error = xfs_iread_extents(sc->tp, ip, whichfork);
439 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
440 goto out;
441 }
442
443 /* Check the btree structure. */
423 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); 444 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
424 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); 445 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
425 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); 446 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
426 xfs_btree_del_cursor(cur, error); 447 xfs_btree_del_cursor(cur, error);
448out:
427 return error; 449 return error;
428} 450}
429 451
@@ -500,7 +522,7 @@ xchk_bmap_check_rmap(
500 522
501out: 523out:
502 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 524 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
503 return XFS_BTREE_QUERY_RANGE_ABORT; 525 return -ECANCELED;
504 return 0; 526 return 0;
505} 527}
506 528
@@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps(
529 sbcri.sc = sc; 551 sbcri.sc = sc;
530 sbcri.whichfork = whichfork; 552 sbcri.whichfork = whichfork;
531 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); 553 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
532 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 554 if (error == -ECANCELED)
533 error = 0; 555 error = 0;
534 556
535 xfs_btree_del_cursor(cur, error); 557 xfs_btree_del_cursor(cur, error);
@@ -671,13 +693,6 @@ xchk_bmap(
671 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 693 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
672 goto out; 694 goto out;
673 695
674 /* Now try to scrub the in-memory extent list. */
675 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
676 error = xfs_iread_extents(sc->tp, ip, whichfork);
677 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
678 goto out;
679 }
680
681 /* Find the offset of the last extent in the mapping. */ 696 /* Find the offset of the last extent in the mapping. */
682 error = xfs_bmap_last_offset(ip, &endoff, whichfork); 697 error = xfs_bmap_last_offset(ip, &endoff, whichfork);
683 if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 698 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -689,7 +704,7 @@ xchk_bmap(
689 for_each_xfs_iext(ifp, &icur, &irec) { 704 for_each_xfs_iext(ifp, &icur, &irec) {
690 if (xchk_should_terminate(sc, &error) || 705 if (xchk_should_terminate(sc, &error) ||
691 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 706 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
692 break; 707 goto out;
693 if (isnullstartblock(irec.br_startblock)) 708 if (isnullstartblock(irec.br_startblock))
694 continue; 709 continue;
695 if (irec.br_startoff >= endoff) { 710 if (irec.br_startoff >= endoff) {
@@ -697,7 +712,7 @@ xchk_bmap(
697 irec.br_startoff); 712 irec.br_startoff);
698 goto out; 713 goto out;
699 } 714 }
700 error = xchk_bmap_extent(ip, NULL, &info, &irec); 715 error = xchk_bmap_iextent(ip, &info, &irec);
701 if (error) 716 if (error)
702 goto out; 717 goto out;
703 } 718 }
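
The new xchk_bmapbt_rec() check reduces to an exact four-field comparison between the on-disk btree record and the cached incore extent. A standalone sketch of that equivalence predicate, with the field types simplified:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for struct xfs_bmbt_irec. */
    struct bmbt_irec {
            unsigned long long br_startoff;
            unsigned long long br_startblock;
            unsigned long long br_blockcount;
            int                br_state;
    };

    /* All four fields must match exactly, else the fork is corrupt. */
    static bool irec_matches(const struct bmbt_irec *disk,
                             const struct bmbt_irec *cached)
    {
            return disk->br_startoff   == cached->br_startoff &&
                   disk->br_startblock == cached->br_startblock &&
                   disk->br_blockcount == cached->br_blockcount &&
                   disk->br_state      == cached->br_state;
    }

    int main(void)
    {
            struct bmbt_irec ondisk = { 0, 100, 8, 0 };
            struct bmbt_irec cached = { 0, 100, 8, 0 };

            printf("match: %d\n", irec_matches(&ondisk, &cached));
            return 0;
    }
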
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index fc3f510c9034..98f82d7c8b40 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -125,7 +125,7 @@ xchk_setup_fscounters(
125 struct xchk_fscounters *fsc; 125 struct xchk_fscounters *fsc;
126 int error; 126 int error;
127 127
128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); 128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
129 if (!sc->buf) 129 if (!sc->buf)
130 return -ENOMEM; 130 return -ENOMEM;
131 fsc = sc->buf; 131 fsc = sc->buf;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4cfeec57fb05..b70a88bc975e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -351,7 +351,7 @@ xrep_init_btblock(
351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); 352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); 353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
354 xfs_trans_log_buf(tp, bp, 0, bp->b_length); 354 xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
355 bp->b_ops = ops; 355 bp->b_ops = ops;
356 *bpp = bp; 356 *bpp = bp;
357 357
@@ -664,7 +664,7 @@ xrep_findroot_agfl_walk(
664{ 664{
665 xfs_agblock_t *agbno = priv; 665 xfs_agblock_t *agbno = priv;
666 666
667 return (*agbno == bno) ? XFS_ITER_ABORT : 0; 667 return (*agbno == bno) ? -ECANCELED : 0;
668} 668}
669 669
670/* Does this block match the btree information passed in? */ 670/* Does this block match the btree information passed in? */
@@ -694,7 +694,7 @@ xrep_findroot_block(
694 if (owner == XFS_RMAP_OWN_AG) { 694 if (owner == XFS_RMAP_OWN_AG) {
695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, 695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
696 xrep_findroot_agfl_walk, &agbno); 696 xrep_findroot_agfl_walk, &agbno);
697 if (error == XFS_ITER_ABORT) 697 if (error == -ECANCELED)
698 return 0; 698 return 0;
699 if (error) 699 if (error)
700 return error; 700 return error;
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 99c0b1234c3c..5641ae512c9e 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@ xchk_setup_symlink(
22 struct xfs_inode *ip) 22 struct xfs_inode *ip)
23{ 23{
24 /* Allocate the buffer without the inode lock held. */ 24 /* Allocate the buffer without the inode lock held. */
25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); 25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
26 if (!sc->buf) 26 if (!sc->buf)
27 return -ENOMEM; 27 return -ENOMEM;
28 28
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cbda40d40326..96d7071cfa46 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -112,7 +112,7 @@ xfs_get_acl(struct inode *inode, int type)
112{ 112{
113 struct xfs_inode *ip = XFS_I(inode); 113 struct xfs_inode *ip = XFS_I(inode);
114 struct posix_acl *acl = NULL; 114 struct posix_acl *acl = NULL;
115 struct xfs_acl *xfs_acl; 115 struct xfs_acl *xfs_acl = NULL;
116 unsigned char *ea_name; 116 unsigned char *ea_name;
117 int error; 117 int error;
118 int len; 118 int len;
@@ -135,12 +135,8 @@ xfs_get_acl(struct inode *inode, int type)
135 * go out to the disk. 135 * go out to the disk.
136 */ 136 */
137 len = XFS_ACL_MAX_SIZE(ip->i_mount); 137 len = XFS_ACL_MAX_SIZE(ip->i_mount);
138 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 138 error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
139 if (!xfs_acl) 139 ATTR_ALLOC | ATTR_ROOT);
140 return ERR_PTR(-ENOMEM);
141
142 error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
143 &len, ATTR_ROOT);
144 if (error) { 140 if (error) {
145 /* 141 /*
146 * If the attribute doesn't exist make sure we have a negative 142 * If the attribute doesn't exist make sure we have a negative
@@ -151,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type)
151 } else { 147 } else {
152 acl = xfs_acl_from_disk(xfs_acl, len, 148 acl = xfs_acl_from_disk(xfs_acl, len,
153 XFS_ACL_MAX_ENTRIES(ip->i_mount)); 149 XFS_ACL_MAX_ENTRIES(ip->i_mount));
150 kmem_free(xfs_acl);
154 } 151 }
155 kmem_free(xfs_acl);
156 return acl; 152 return acl;
157} 153}
158 154
@@ -180,7 +176,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
180 struct xfs_acl *xfs_acl; 176 struct xfs_acl *xfs_acl;
181 int len = XFS_ACL_MAX_SIZE(ip->i_mount); 177 int len = XFS_ACL_MAX_SIZE(ip->i_mount);
182 178
183 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 179 xfs_acl = kmem_zalloc_large(len, 0);
184 if (!xfs_acl) 180 if (!xfs_acl)
185 return -ENOMEM; 181 return -ENOMEM;
186 182
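
The ACL read path above moves from a caller-allocated buffer to having xfs_attr_get() allocate one when ATTR_ALLOC is passed, with the buffer handed to the caller only on success. A userspace model of that allocating-getter contract (get_value() and its errnos are illustrative, not the kernel API):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Allocating getter: on success *bufp is a buffer the caller must
     * free; on error nothing is handed back. This mirrors the ATTR_ALLOC
     * ownership rule but is not the kernel interface.
     */
    static int get_value(const char *name, unsigned char **bufp, int *len)
    {
            const char *value = "demo-acl-bytes";

            if (strcmp(name, "acl") != 0)
                    return -ENODATA;

            *len = (int)strlen(value);
            *bufp = malloc((size_t)*len);
            if (!*bufp)
                    return -ENOMEM;
            memcpy(*bufp, value, (size_t)*len);
            return 0;
    }

    int main(void)
    {
            unsigned char *buf = NULL;
            int len = 0;
            int error = get_value("acl", &buf, &len);

            if (!error) {
                    printf("got %d bytes\n", len);
                    free(buf);              /* caller owns the buffer */
            }
            return error ? 1 : 0;
    }
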
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index dc93c51c17de..a640a285cc52 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -147,7 +147,7 @@ xfs_attr3_leaf_inactive(
147 * Allocate storage for a list of all the "remote" value extents. 147 * Allocate storage for a list of all the "remote" value extents.
148 */ 148 */
149 size = count * sizeof(xfs_attr_inactive_list_t); 149 size = count * sizeof(xfs_attr_inactive_list_t);
150 list = kmem_alloc(size, KM_SLEEP); 150 list = kmem_alloc(size, 0);
151 151
152 /* 152 /*
153 * Identify each of the "remote" value extents. 153 * Identify each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 58fc820a70c6..00758fdc2fec 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -109,7 +109,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
109 * It didn't all fit, so we have to sort everything on hashval. 109 * It didn't all fit, so we have to sort everything on hashval.
110 */ 110 */
111 sbsize = sf->hdr.count * sizeof(*sbuf); 111 sbsize = sf->hdr.count * sizeof(*sbuf);
112 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); 112 sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
113 113
114 /* 114 /*
115 * Scan the attribute list for the rest of the entries, storing 115 * Scan the attribute list for the rest of the entries, storing
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9fa4a7ee8cfc..83d24e983d4c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -141,7 +141,7 @@ xfs_bui_init(
141{ 141{
142 struct xfs_bui_log_item *buip; 142 struct xfs_bui_log_item *buip;
143 143
144 buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); 144 buip = kmem_zone_zalloc(xfs_bui_zone, 0);
145 145
146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); 146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; 147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -218,7 +218,7 @@ xfs_trans_get_bud(
218{ 218{
219 struct xfs_bud_log_item *budp; 219 struct xfs_bud_log_item *budp;
220 220
221 budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); 221 budp = kmem_zone_zalloc(xfs_bud_zone, 0);
222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, 222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
223 &xfs_bud_item_ops); 223 &xfs_bud_item_ops);
224 budp->bud_buip = buip; 224 budp->bud_buip = buip;
@@ -542,9 +542,7 @@ xfs_bui_recover(
542 irec.br_blockcount = count; 542 irec.br_blockcount = count;
543 irec.br_startoff = bmap->me_startoff; 543 irec.br_startoff = bmap->me_startoff;
544 irec.br_state = state; 544 irec.br_state = state;
545 error = xfs_bmap_unmap_extent(tp, ip, &irec); 545 xfs_bmap_unmap_extent(tp, ip, &irec);
546 if (error)
547 goto err_inode;
548 } 546 }
549 547
550 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); 548 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 98c6a7a71427..0910cb75b65d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -39,9 +39,9 @@
39xfs_daddr_t 39xfs_daddr_t
40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
41{ 41{
42 return (XFS_IS_REALTIME_INODE(ip) ? \ 42 if (XFS_IS_REALTIME_INODE(ip))
43 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ 43 return XFS_FSB_TO_BB(ip->i_mount, fsb);
44 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); 44 return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
45} 45}
46 46
47/* 47/*
@@ -1532,24 +1532,16 @@ xfs_swap_extent_rmap(
1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); 1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
1533 1533
1534 /* Remove the mapping from the donor file. */ 1534 /* Remove the mapping from the donor file. */
1535 error = xfs_bmap_unmap_extent(tp, tip, &uirec); 1535 xfs_bmap_unmap_extent(tp, tip, &uirec);
1536 if (error)
1537 goto out;
1538 1536
1539 /* Remove the mapping from the source file. */ 1537 /* Remove the mapping from the source file. */
1540 error = xfs_bmap_unmap_extent(tp, ip, &irec); 1538 xfs_bmap_unmap_extent(tp, ip, &irec);
1541 if (error)
1542 goto out;
1543 1539
1544 /* Map the donor file's blocks into the source file. */ 1540 /* Map the donor file's blocks into the source file. */
1545 error = xfs_bmap_map_extent(tp, ip, &uirec); 1541 xfs_bmap_map_extent(tp, ip, &uirec);
1546 if (error)
1547 goto out;
1548 1542
1549 /* Map the source file's blocks into the donor file. */ 1543 /* Map the source file's blocks into the donor file. */
1550 error = xfs_bmap_map_extent(tp, tip, &irec); 1544 xfs_bmap_map_extent(tp, tip, &irec);
1551 if (error)
1552 goto out;
1553 1545
1554 error = xfs_defer_finish(tpp); 1546 error = xfs_defer_finish(tpp);
1555 tp = *tpp; 1547 tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ca0849043f54..120ef99d09e8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -353,7 +353,8 @@ xfs_buf_allocate_memory(
353 */ 353 */
354 size = BBTOB(bp->b_length); 354 size = BBTOB(bp->b_length);
355 if (size < PAGE_SIZE) { 355 if (size < PAGE_SIZE) {
356 bp->b_addr = kmem_alloc(size, KM_NOFS); 356 int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
357 bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
357 if (!bp->b_addr) { 358 if (!bp->b_addr) {
358 /* low memory - use alloc_page loop instead */ 359 /* low memory - use alloc_page loop instead */
359 goto use_alloc_page; 360 goto use_alloc_page;
@@ -368,7 +369,7 @@ xfs_buf_allocate_memory(
368 } 369 }
369 bp->b_offset = offset_in_page(bp->b_addr); 370 bp->b_offset = offset_in_page(bp->b_addr);
370 bp->b_pages = bp->b_page_array; 371 bp->b_pages = bp->b_page_array;
371 bp->b_pages[0] = virt_to_page(bp->b_addr); 372 bp->b_pages[0] = kmem_to_page(bp->b_addr);
372 bp->b_page_count = 1; 373 bp->b_page_count = 1;
373 bp->b_flags |= _XBF_KMEM; 374 bp->b_flags |= _XBF_KMEM;
374 return 0; 375 return 0;
@@ -1741,7 +1742,7 @@ xfs_alloc_buftarg(
1741{ 1742{
1742 xfs_buftarg_t *btp; 1743 xfs_buftarg_t *btp;
1743 1744
1744 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1745 btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
1745 1746
1746 btp->bt_mount = mp; 1747 btp->bt_mount = mp;
1747 btp->bt_dev = bdev->bd_dev; 1748 btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c6e57a3f409e..f6ce17d8d848 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -350,6 +350,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
352 352
353static inline int
354xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
355{
356 return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
357}
358
353int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); 359int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
354bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); 360bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
355bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); 361bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
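
xfs_buftarg_dma_alignment() exposes the request queue's DMA alignment mask so kmem_alloc_io() can return memory that satisfies it. A rough userspace analogue using aligned_alloc(), where the 511-byte mask (512-byte alignment) is only an assumed typical value:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Assumed DMA alignment mask: 511 means 512-byte alignment. */
    #define DMA_ALIGN_MASK 511U

    static void *alloc_io(size_t size, unsigned int align_mask)
    {
            size_t align = (size_t)align_mask + 1;  /* mask is 2^n - 1 */

            /* aligned_alloc() wants size to be a multiple of align. */
            return aligned_alloc(align, (size + align - 1) & ~(align - 1));
    }

    int main(void)
    {
            void *p = alloc_io(1024, DMA_ALIGN_MASK);

            if (!p)
                    return 1;
            printf("addr & mask = %lu\n",
                   (unsigned long)((uintptr_t)p & DMA_ALIGN_MASK));
            free(p);
            return 0;
    }
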
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7dcaec54a20b..d74fbd1e9d3e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -702,7 +702,7 @@ xfs_buf_item_get_format(
702 } 702 }
703 703
704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
705 KM_SLEEP); 705 0);
706 if (!bip->bli_formats) 706 if (!bip->bli_formats)
707 return -ENOMEM; 707 return -ENOMEM;
708 return 0; 708 return 0;
@@ -747,7 +747,7 @@ xfs_buf_item_init(
747 return 0; 747 return 0;
748 } 748 }
749 749
750 bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); 750 bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); 751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
752 bip->bli_buf = bp; 752 bip->bli_buf = bp;
753 753
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fb1ad4483081..aeb95e7391c1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -440,7 +440,7 @@ xfs_dquot_alloc(
440{ 440{
441 struct xfs_dquot *dqp; 441 struct xfs_dquot *dqp;
442 442
443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
444 444
445 dqp->dq_flags = type; 445 dqp->dq_flags = type;
446 dqp->q_core.d_id = cpu_to_be32(id); 446 dqp->q_core.d_id = cpu_to_be32(id);
@@ -1239,7 +1239,7 @@ xfs_qm_exit(void)
1239/* 1239/*
1240 * Iterate every dquot of a particular type. The caller must ensure that the 1240 * Iterate every dquot of a particular type. The caller must ensure that the
1241 * particular quota type is active. iter_fn can return negative error codes, 1241 * particular quota type is active. iter_fn can return negative error codes,
1242 * or XFS_ITER_ABORT to indicate that it wants to stop iterating. 1242 * or -ECANCELED to indicate that it wants to stop iterating.
1243 */ 1243 */
1244int 1244int
1245xfs_qm_dqiterate( 1245xfs_qm_dqiterate(
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 282ec5af293e..d60647d7197b 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init(
347{ 347{
348 struct xfs_qoff_logitem *qf; 348 struct xfs_qoff_logitem *qf;
349 349
350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); 350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
351 351
352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); 353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 544c9482a0ef..849fd4476950 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -213,7 +213,7 @@ xfs_errortag_init(
213 struct xfs_mount *mp) 213 struct xfs_mount *mp)
214{ 214{
215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, 215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
216 KM_SLEEP | KM_MAYFAIL); 216 KM_MAYFAIL);
217 if (!mp->m_errortag) 217 if (!mp->m_errortag)
218 return -ENOMEM; 218 return -ENOMEM;
219 219
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed68379e551..2183d87be4cf 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -33,7 +33,7 @@ xfs_extent_busy_insert(
33 struct rb_node **rbp; 33 struct rb_node **rbp;
34 struct rb_node *parent = NULL; 34 struct rb_node *parent = NULL;
35 35
36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); 36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
37 new->agno = agno; 37 new->agno = agno;
38 new->bno = bno; 38 new->bno = bno;
39 new->length = len; 39 new->length = len;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 86f6512d6864..e44efc41a041 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -163,9 +163,9 @@ xfs_efi_init(
163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { 163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
164 size = (uint)(sizeof(xfs_efi_log_item_t) + 164 size = (uint)(sizeof(xfs_efi_log_item_t) +
165 ((nextents - 1) * sizeof(xfs_extent_t))); 165 ((nextents - 1) * sizeof(xfs_extent_t)));
166 efip = kmem_zalloc(size, KM_SLEEP); 166 efip = kmem_zalloc(size, 0);
167 } else { 167 } else {
168 efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); 168 efip = kmem_zone_zalloc(xfs_efi_zone, 0);
169 } 169 }
170 170
171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -333,9 +333,9 @@ xfs_trans_get_efd(
333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { 333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + 334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
335 (nextents - 1) * sizeof(struct xfs_extent), 335 (nextents - 1) * sizeof(struct xfs_extent),
336 KM_SLEEP); 336 0);
337 } else { 337 } else {
338 efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); 338 efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
339 } 339 }
340 340
341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, 341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 28101bbc0b78..d952d5962e93 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,6 +28,7 @@
28#include <linux/falloc.h> 28#include <linux/falloc.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/mman.h> 30#include <linux/mman.h>
31#include <linux/fadvise.h>
31 32
32static const struct vm_operations_struct xfs_file_vm_ops; 33static const struct vm_operations_struct xfs_file_vm_ops;
33 34
@@ -933,6 +934,30 @@ out_unlock:
933 return error; 934 return error;
934} 935}
935 936
937STATIC int
938xfs_file_fadvise(
939 struct file *file,
940 loff_t start,
941 loff_t end,
942 int advice)
943{
944 struct xfs_inode *ip = XFS_I(file_inode(file));
945 int ret;
946 int lockflags = 0;
947
948 /*
949 * Operations creating pages in page cache need protection from hole
 950 * punching and similar ops.
951 */
952 if (advice == POSIX_FADV_WILLNEED) {
953 lockflags = XFS_IOLOCK_SHARED;
954 xfs_ilock(ip, lockflags);
955 }
956 ret = generic_fadvise(file, start, end, advice);
957 if (lockflags)
958 xfs_iunlock(ip, lockflags);
959 return ret;
960}
936 961
937STATIC loff_t 962STATIC loff_t
938xfs_file_remap_range( 963xfs_file_remap_range(
@@ -1232,6 +1257,7 @@ const struct file_operations xfs_file_operations = {
1232 .fsync = xfs_file_fsync, 1257 .fsync = xfs_file_fsync,
1233 .get_unmapped_area = thp_get_unmapped_area, 1258 .get_unmapped_area = thp_get_unmapped_area,
1234 .fallocate = xfs_file_fallocate, 1259 .fallocate = xfs_file_fallocate,
1260 .fadvise = xfs_file_fadvise,
1235 .remap_file_range = xfs_file_remap_range, 1261 .remap_file_range = xfs_file_remap_range,
1236}; 1262};
1237 1263
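
The new ->fadvise hook means a POSIX_FADV_WILLNEED request now takes the inode IOLOCK in shared mode, so readahead cannot instantiate pages while a hole punch is in progress. The userspace call is unchanged; for example:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/etc/hostname", O_RDONLY);       /* any readable file */
            if (fd < 0)
                    return 1;

            /* Kicks off readahead; on XFS this path now holds IOLOCK_SHARED. */
            int ret = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
            if (ret)
                    fprintf(stderr, "fadvise: %d\n", ret);

            close(fd);
            return ret ? 1 : 0;
    }
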
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a8f9641562a..d082143feb5a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -250,7 +250,7 @@ xfs_getfsmap_helper(
250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
251 if (info->next_daddr < rec_daddr) 251 if (info->next_daddr < rec_daddr)
252 info->next_daddr = rec_daddr; 252 info->next_daddr = rec_daddr;
253 return XFS_BTREE_QUERY_RANGE_CONTINUE; 253 return 0;
254 } 254 }
255 255
256 /* Are we just counting mappings? */ 256 /* Are we just counting mappings? */
@@ -259,14 +259,14 @@ xfs_getfsmap_helper(
259 info->head->fmh_entries++; 259 info->head->fmh_entries++;
260 260
261 if (info->last) 261 if (info->last)
262 return XFS_BTREE_QUERY_RANGE_CONTINUE; 262 return 0;
263 263
264 info->head->fmh_entries++; 264 info->head->fmh_entries++;
265 265
266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
267 if (info->next_daddr < rec_daddr) 267 if (info->next_daddr < rec_daddr)
268 info->next_daddr = rec_daddr; 268 info->next_daddr = rec_daddr;
269 return XFS_BTREE_QUERY_RANGE_CONTINUE; 269 return 0;
270 } 270 }
271 271
272 /* 272 /*
@@ -276,7 +276,7 @@ xfs_getfsmap_helper(
276 */ 276 */
277 if (rec_daddr > info->next_daddr) { 277 if (rec_daddr > info->next_daddr) {
278 if (info->head->fmh_entries >= info->head->fmh_count) 278 if (info->head->fmh_entries >= info->head->fmh_count)
279 return XFS_BTREE_QUERY_RANGE_ABORT; 279 return -ECANCELED;
280 280
281 fmr.fmr_device = info->dev; 281 fmr.fmr_device = info->dev;
282 fmr.fmr_physical = info->next_daddr; 282 fmr.fmr_physical = info->next_daddr;
@@ -295,7 +295,7 @@ xfs_getfsmap_helper(
295 295
296 /* Fill out the extent we found */ 296 /* Fill out the extent we found */
297 if (info->head->fmh_entries >= info->head->fmh_count) 297 if (info->head->fmh_entries >= info->head->fmh_count)
298 return XFS_BTREE_QUERY_RANGE_ABORT; 298 return -ECANCELED;
299 299
300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); 300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
301 301
@@ -328,7 +328,7 @@ out:
328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
329 if (info->next_daddr < rec_daddr) 329 if (info->next_daddr < rec_daddr)
330 info->next_daddr = rec_daddr; 330 info->next_daddr = rec_daddr;
331 return XFS_BTREE_QUERY_RANGE_CONTINUE; 331 return 0;
332} 332}
333 333
334/* Transform a rmapbt irec into a fsmap */ 334/* Transform a rmapbt irec into a fsmap */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0b0fd10a36d4..944add5ff8e0 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -40,7 +40,7 @@ xfs_inode_alloc(
40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the 40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
41 * code up to do this anyway. 41 * code up to do this anyway.
42 */ 42 */
43 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 43 ip = kmem_zone_alloc(xfs_inode_zone, 0);
44 if (!ip) 44 if (!ip)
45 return NULL; 45 return NULL;
46 if (inode_init_always(mp->m_super, VFS_I(ip))) { 46 if (inode_init_always(mp->m_super, VFS_I(ip))) {
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d99a0a3e5f40..3ebd1b7f49d8 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -89,7 +89,7 @@ xfs_icreate_log(
89{ 89{
90 struct xfs_icreate_item *icp; 90 struct xfs_icreate_item *icp;
91 91
92 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); 92 icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
93 93
94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, 94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
95 &xfs_icreate_item_ops); 95 &xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6467d5e1df2d..18f4b262e61c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2018,7 +2018,7 @@ xfs_iunlink_add_backref(
2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) 2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2019 return 0; 2019 return 0;
2020 2020
2021 iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); 2021 iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
2022 iu->iu_agino = prev_agino; 2022 iu->iu_agino = prev_agino;
2023 iu->iu_next_unlinked = this_agino; 2023 iu->iu_next_unlinked = this_agino;
2024 2024
@@ -3282,7 +3282,8 @@ xfs_rename(
3282 spaceres); 3282 spaceres);
3283 3283
3284 /* 3284 /*
3285 * Set up the target. 3285 * Check for expected errors before we dirty the transaction
3286 * so we can return an error without a transaction abort.
3286 */ 3287 */
3287 if (target_ip == NULL) { 3288 if (target_ip == NULL) {
3288 /* 3289 /*
@@ -3294,6 +3295,46 @@ xfs_rename(
3294 if (error) 3295 if (error)
3295 goto out_trans_cancel; 3296 goto out_trans_cancel;
3296 } 3297 }
3298 } else {
3299 /*
 3300 * If target exists and it's a directory, check whether
3301 * it can be destroyed.
3302 */
3303 if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
3304 (!xfs_dir_isempty(target_ip) ||
3305 (VFS_I(target_ip)->i_nlink > 2))) {
3306 error = -EEXIST;
3307 goto out_trans_cancel;
3308 }
3309 }
3310
3311 /*
3312 * Directory entry creation below may acquire the AGF. Remove
3313 * the whiteout from the unlinked list first to preserve correct
3314 * AGI/AGF locking order. This dirties the transaction so failures
3315 * after this point will abort and log recovery will clean up the
3316 * mess.
3317 *
3318 * For whiteouts, we need to bump the link count on the whiteout
3319 * inode. After this point, we have a real link, clear the tmpfile
3320 * state flag from the inode so it doesn't accidentally get misused
3321 * in future.
3322 */
3323 if (wip) {
3324 ASSERT(VFS_I(wip)->i_nlink == 0);
3325 error = xfs_iunlink_remove(tp, wip);
3326 if (error)
3327 goto out_trans_cancel;
3328
3329 xfs_bumplink(tp, wip);
3330 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3331 VFS_I(wip)->i_state &= ~I_LINKABLE;
3332 }
3333
3334 /*
3335 * Set up the target.
3336 */
3337 if (target_ip == NULL) {
3297 /* 3338 /*
3298 * If target does not exist and the rename crosses 3339 * If target does not exist and the rename crosses
3299 * directories, adjust the target directory link count 3340 * directories, adjust the target directory link count
@@ -3312,22 +3353,6 @@ xfs_rename(
3312 } 3353 }
3313 } else { /* target_ip != NULL */ 3354 } else { /* target_ip != NULL */
3314 /* 3355 /*
3315 * If target exists and it's a directory, check that both
3316 * target and source are directories and that target can be
3317 * destroyed, or that neither is a directory.
3318 */
3319 if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3320 /*
3321 * Make sure target dir is empty.
3322 */
3323 if (!(xfs_dir_isempty(target_ip)) ||
3324 (VFS_I(target_ip)->i_nlink > 2)) {
3325 error = -EEXIST;
3326 goto out_trans_cancel;
3327 }
3328 }
3329
3330 /*
3331 * Link the source inode under the target name. 3356 * Link the source inode under the target name.
3332 * If the source inode is a directory and we are moving 3357 * If the source inode is a directory and we are moving
3333 * it across directories, its ".." entry will be 3358 * it across directories, its ".." entry will be
@@ -3417,30 +3442,6 @@ xfs_rename(
3417 if (error) 3442 if (error)
3418 goto out_trans_cancel; 3443 goto out_trans_cancel;
3419 3444
3420 /*
3421 * For whiteouts, we need to bump the link count on the whiteout inode.
3422 * This means that failures all the way up to this point leave the inode
3423 * on the unlinked list and so cleanup is a simple matter of dropping
3424 * the remaining reference to it. If we fail here after bumping the link
3425 * count, we're shutting down the filesystem so we'll never see the
3426 * intermediate state on disk.
3427 */
3428 if (wip) {
3429 ASSERT(VFS_I(wip)->i_nlink == 0);
3430 xfs_bumplink(tp, wip);
3431 error = xfs_iunlink_remove(tp, wip);
3432 if (error)
3433 goto out_trans_cancel;
3434 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3435
3436 /*
3437 * Now we have a real link, clear the "I'm a tmpfile" state
3438 * flag from the inode so it doesn't accidentally get misused in
3439 * future.
3440 */
3441 VFS_I(wip)->i_state &= ~I_LINKABLE;
3442 }
3443
3444 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3445 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3445 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3446 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3446 if (new_parent) 3447 if (new_parent)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c9a502eed204..bb8f076805b9 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,7 +651,7 @@ xfs_inode_item_init(
651 struct xfs_inode_log_item *iip; 651 struct xfs_inode_log_item *iip;
652 652
653 ASSERT(ip->i_itemp == NULL); 653 ASSERT(ip->i_itemp == NULL);
654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
655 655
656 iip->ili_inode = ip; 656 iip->ili_inode = ip;
657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index affa557c2337..d58f0d6a699e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -396,7 +396,7 @@ xfs_attrlist_by_handle(
396 if (IS_ERR(dentry)) 396 if (IS_ERR(dentry))
397 return PTR_ERR(dentry); 397 return PTR_ERR(dentry);
398 398
399 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 399 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
400 if (!kbuf) 400 if (!kbuf)
401 goto out_dput; 401 goto out_dput;
402 402
@@ -434,11 +434,11 @@ xfs_attrmulti_attr_get(
434 434
435 if (*len > XFS_XATTR_SIZE_MAX) 435 if (*len > XFS_XATTR_SIZE_MAX)
436 return -EINVAL; 436 return -EINVAL;
437 kbuf = kmem_zalloc_large(*len, KM_SLEEP); 437 kbuf = kmem_zalloc_large(*len, 0);
438 if (!kbuf) 438 if (!kbuf)
439 return -ENOMEM; 439 return -ENOMEM;
440 440
441 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 441 error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
442 if (error) 442 if (error)
443 goto out_kfree; 443 goto out_kfree;
444 444
@@ -831,7 +831,7 @@ xfs_bulkstat_fmt(
831/* 831/*
832 * Check the incoming bulk request @hdr from userspace and initialize the 832 * Check the incoming bulk request @hdr from userspace and initialize the
833 * internal @breq bulk request appropriately. Returns 0 if the bulk request 833 * internal @breq bulk request appropriately. Returns 0 if the bulk request
834 * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual 834 * should proceed; -ECANCELED if there's nothing to do; or the usual
835 * negative error code. 835 * negative error code.
836 */ 836 */
837static int 837static int
@@ -889,13 +889,13 @@ xfs_bulk_ireq_setup(
889 889
890 /* Asking for an inode past the end of the AG? We're done! */ 890 /* Asking for an inode past the end of the AG? We're done! */
891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) 891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
892 return XFS_ITER_ABORT; 892 return -ECANCELED;
893 } else if (hdr->agno) 893 } else if (hdr->agno)
894 return -EINVAL; 894 return -EINVAL;
895 895
896 /* Asking for an inode past the end of the FS? We're done! */ 896 /* Asking for an inode past the end of the FS? We're done! */
897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) 897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
898 return XFS_ITER_ABORT; 898 return -ECANCELED;
899 899
900 return 0; 900 return 0;
901} 901}
@@ -936,7 +936,7 @@ xfs_ioc_bulkstat(
936 return -EFAULT; 936 return -EFAULT;
937 937
938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); 938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
939 if (error == XFS_ITER_ABORT) 939 if (error == -ECANCELED)
940 goto out_teardown; 940 goto out_teardown;
941 if (error < 0) 941 if (error < 0)
942 return error; 942 return error;
@@ -986,7 +986,7 @@ xfs_ioc_inumbers(
986 return -EFAULT; 986 return -EFAULT;
987 987
988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); 988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
989 if (error == XFS_ITER_ABORT) 989 if (error == -ECANCELED)
990 goto out_teardown; 990 goto out_teardown;
991 if (error < 0) 991 if (error < 0)
992 return error; 992 return error;
@@ -1038,6 +1038,10 @@ xfs_ioc_ag_geometry(
1038 1038
1039 if (copy_from_user(&ageo, arg, sizeof(ageo))) 1039 if (copy_from_user(&ageo, arg, sizeof(ageo)))
1040 return -EFAULT; 1040 return -EFAULT;
1041 if (ageo.ag_flags)
1042 return -EINVAL;
1043 if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
1044 return -EINVAL;
1041 1045
1042 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); 1046 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
1043 if (error) 1047 if (error)
@@ -1309,8 +1313,7 @@ xfs_ioctl_setattr_dax_invalidate(
1309 if (fa->fsx_xflags & FS_XFLAG_DAX) { 1313 if (fa->fsx_xflags & FS_XFLAG_DAX) {
1310 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 1314 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
1311 return -EINVAL; 1315 return -EINVAL;
1312 if (S_ISREG(inode->i_mode) && 1316 if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1313 !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1314 sb->s_blocksize)) 1317 sb->s_blocksize))
1315 return -EINVAL; 1318 return -EINVAL;
1316 } 1319 }
@@ -1881,7 +1884,7 @@ xfs_ioc_getfsmap(
1881 info.mp = ip->i_mount; 1884 info.mp = ip->i_mount;
1882 info.data = arg; 1885 info.data = arg;
1883 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); 1886 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
1884 if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 1887 if (error == -ECANCELED) {
1885 error = 0; 1888 error = 0;
1886 aborted = true; 1889 aborted = true;
1887 } else if (error) 1890 } else if (error)
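
The added xfs_ioc_ag_geometry() checks reject nonzero flags and reserved bytes so that padding can later be assigned meaning unambiguously. memchr_inv() has no userspace counterpart, but the equivalent check is a simple byte scan (the struct layout below is assumed for illustration):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Assumed layout for illustration only. */
    struct ag_geometry {
            uint32_t ag_number;
            uint32_t ag_flags;
            uint64_t ag_reserved[14];
    };

    /* Return 1 if any byte is nonzero -- what memchr_inv() detects. */
    static int any_nonzero(const void *buf, size_t len)
    {
            const unsigned char *p = buf;
            size_t i;

            for (i = 0; i < len; i++)
                    if (p[i])
                            return 1;
            return 0;
    }

    static int validate(const struct ag_geometry *g)
    {
            if (g->ag_flags)
                    return -1;      /* -EINVAL in the kernel */
            if (any_nonzero(g->ag_reserved, sizeof(g->ag_reserved)))
                    return -1;
            return 0;
    }

    int main(void)
    {
            struct ag_geometry g = { .ag_number = 3 };

            printf("valid: %s\n", validate(&g) ? "no" : "yes");
            return 0;
    }
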
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7bd7534f5051..1e08bf79b478 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle(
381 return PTR_ERR(dentry); 381 return PTR_ERR(dentry);
382 382
383 error = -ENOMEM; 383 error = -ENOMEM;
384 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 384 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
385 if (!kbuf) 385 if (!kbuf)
386 goto out_dput; 386 goto out_dput;
387 387
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3a4310d7cb59..f780e223b118 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -58,7 +58,7 @@ xfs_bmbt_to_iomap(
58{ 58{
59 struct xfs_mount *mp = ip->i_mount; 59 struct xfs_mount *mp = ip->i_mount;
60 60
61 if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) 61 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
62 return xfs_alert_fsblock_zero(ip, imap); 62 return xfs_alert_fsblock_zero(ip, imap);
63 63
64 if (imap->br_startblock == HOLESTARTBLOCK) { 64 if (imap->br_startblock == HOLESTARTBLOCK) {
@@ -297,7 +297,7 @@ xfs_iomap_write_direct(
297 goto out_unlock; 297 goto out_unlock;
298 } 298 }
299 299
300 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 300 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
301 error = xfs_alert_fsblock_zero(ip, imap); 301 error = xfs_alert_fsblock_zero(ip, imap);
302 302
303out_unlock: 303out_unlock:
@@ -814,7 +814,7 @@ xfs_iomap_write_unwritten(
814 if (error) 814 if (error)
815 return error; 815 return error;
816 816
817 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 817 if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
818 return xfs_alert_fsblock_zero(ip, &imap); 818 return xfs_alert_fsblock_zero(ip, &imap);
819 819
820 if ((numblks_fsb = imap.br_blockcount) == 0) { 820 if ((numblks_fsb = imap.br_blockcount) == 0) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5c955d35be4..884950adbd16 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -137,7 +137,7 @@ xfs_bulkstat_one_int(
137 xfs_irele(ip); 137 xfs_irele(ip);
138 138
139 error = bc->formatter(bc->breq, buf); 139 error = bc->formatter(bc->breq, buf);
140 if (error == XFS_IBULK_ABORT) 140 if (error == -ECANCELED)
141 goto out_advance; 141 goto out_advance;
142 if (error) 142 if (error)
143 goto out; 143 goto out;
@@ -169,7 +169,7 @@ xfs_bulkstat_one(
169 ASSERT(breq->icount == 1); 169 ASSERT(breq->icount == 1);
170 170
171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
172 KM_SLEEP | KM_MAYFAIL); 172 KM_MAYFAIL);
173 if (!bc.buf) 173 if (!bc.buf)
174 return -ENOMEM; 174 return -ENOMEM;
175 175
@@ -181,7 +181,7 @@ xfs_bulkstat_one(
181 * If we reported one inode to userspace then we abort because we hit 181 * If we reported one inode to userspace then we abort because we hit
182 * the end of the buffer. Don't leak that back to userspace. 182 * the end of the buffer. Don't leak that back to userspace.
183 */ 183 */
184 if (error == XFS_IWALK_ABORT) 184 if (error == -ECANCELED)
185 error = 0; 185 error = 0;
186 186
187 return error; 187 return error;
@@ -243,7 +243,7 @@ xfs_bulkstat(
243 return 0; 243 return 0;
244 244
245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
246 KM_SLEEP | KM_MAYFAIL); 246 KM_MAYFAIL);
247 if (!bc.buf) 247 if (!bc.buf)
248 return -ENOMEM; 248 return -ENOMEM;
249 249
@@ -342,7 +342,7 @@ xfs_inumbers_walk(
342 int error; 342 int error;
343 343
344 error = ic->formatter(ic->breq, &inogrp); 344 error = ic->formatter(ic->breq, &inogrp);
345 if (error && error != XFS_IBULK_ABORT) 345 if (error && error != -ECANCELED)
346 return error; 346 return error;
347 347
348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + 348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index e90c1fc5b981..96a1e2a9be3f 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -18,9 +18,6 @@ struct xfs_ibulk {
18/* Only iterate within the same AG as startino */ 18/* Only iterate within the same AG as startino */
19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) 19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
20 20
21/* Return value that means we want to abort the walk. */
22#define XFS_IBULK_ABORT (XFS_IWALK_ABORT)
23
24/* 21/*
25 * Advance the user buffer pointer by one record of the given size. If the 22 * Advance the user buffer pointer by one record of the given size. If the
26 * buffer is now full, return the appropriate error code. 23 * buffer is now full, return the appropriate error code.
@@ -34,13 +31,21 @@ xfs_ibulk_advance(
34 31
35 breq->ubuffer = b + bytes; 32 breq->ubuffer = b + bytes;
36 breq->ocount++; 33 breq->ocount++;
37 return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; 34 return breq->ocount == breq->icount ? -ECANCELED : 0;
38} 35}
39 36
40/* 37/*
41 * Return stat information in bulk (by-inode) for the filesystem. 38 * Return stat information in bulk (by-inode) for the filesystem.
42 */ 39 */
43 40
41/*
42 * Return codes for the formatter function are 0 to continue iterating, and
43 * non-zero to stop iterating. Any non-zero value will be passed up to the
44 * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop
45 * iteration, as neither bulkstat nor inumbers will ever generate that error
46 * code on their own.
47 */
48
44typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, 49typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
45 const struct xfs_bulkstat *bstat); 50 const struct xfs_bulkstat *bstat);
46 51
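
xfs_ibulk_advance() now signals a full user buffer with -ECANCELED, per the formatter contract documented above. A self-contained model of that advance-and-check pattern (struct bulkreq here is invented, not the kernel's xfs_ibulk):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct bulkreq {
            char *ubuffer;  /* next free slot in the user buffer */
            int   ocount;   /* records copied so far */
            int   icount;   /* buffer capacity in records */
    };

    /*
     * Copy one fixed-size record out and advance the buffer pointer;
     * signal a full buffer with -ECANCELED, as xfs_ibulk_advance() does.
     */
    static int advance(struct bulkreq *breq, const void *rec, size_t bytes)
    {
            memcpy(breq->ubuffer, rec, bytes);
            breq->ubuffer += bytes;
            breq->ocount++;
            return breq->ocount == breq->icount ? -ECANCELED : 0;
    }

    int main(void)
    {
            char buf[2 * sizeof(int)];
            struct bulkreq breq = { buf, 0, 2 };
            int error = 0;
            int rec;

            for (rec = 0; rec < 5 && !error; rec++)
                    error = advance(&breq, &rec, sizeof(rec));

            if (error == -ECANCELED)        /* full buffer is a clean stop */
                    error = 0;
            printf("copied %d records\n", breq.ocount);
            return error;
    }
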
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 8c7d727149ea..aa375cf53021 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -31,7 +31,7 @@
31 * inode it finds, it calls a walk function with the relevant inode number and 31 * inode it finds, it calls a walk function with the relevant inode number and
32 * a pointer to caller-provided data. The walk function can return the usual 32 * a pointer to caller-provided data. The walk function can return the usual
33 * negative error code to stop the iteration; 0 to continue the iteration; or 33 * negative error code to stop the iteration; 0 to continue the iteration; or
34 * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the 34 * -ECANCELED to stop the iteration. This return value is returned to the
35 * caller. 35 * caller.
36 * 36 *
37 * Internally, we allow the walk function to do anything, which means that we 37 * Internally, we allow the walk function to do anything, which means that we
@@ -616,7 +616,7 @@ xfs_iwalk_threaded(
616 if (xfs_pwork_ctl_want_abort(&pctl)) 616 if (xfs_pwork_ctl_want_abort(&pctl))
617 break; 617 break;
618 618
619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); 619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
620 iwag->mp = mp; 620 iwag->mp = mp;
621 iwag->iwalk_fn = iwalk_fn; 621 iwag->iwalk_fn = iwalk_fn;
622 iwag->data = data; 622 iwag->data = data;
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 6c960e10ed4d..37a795f03267 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -6,12 +6,17 @@
6#ifndef __XFS_IWALK_H__ 6#ifndef __XFS_IWALK_H__
7#define __XFS_IWALK_H__ 7#define __XFS_IWALK_H__
8 8
9/*
10 * Return codes for the inode/inobt walk function are 0 to continue iterating,
11 * and non-zero to stop iterating. Any non-zero value will be passed up to the
12 * iwalk or inobt_walk caller. The special value -ECANCELED can be used to
13 * stop iteration, as neither iwalk nor inobt_walk will ever generate that
14 * error code on their own.
15 */
16
9/* Walk all inodes in the filesystem starting from @startino. */ 17/* Walk all inodes in the filesystem starting from @startino. */
10typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, 18typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
11 xfs_ino_t ino, void *data); 19 xfs_ino_t ino, void *data);
12/* Return values for xfs_iwalk_fn. */
13#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE)
14#define XFS_IWALK_ABORT (XFS_ITER_ABORT)
15 20
16int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, 21int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
17 unsigned int flags, xfs_iwalk_fn iwalk_fn, 22 unsigned int flags, xfs_iwalk_fn iwalk_fn,
@@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
30 xfs_agnumber_t agno, 35 xfs_agnumber_t agno,
31 const struct xfs_inobt_rec_incore *irec, 36 const struct xfs_inobt_rec_incore *irec,
32 void *data); 37 void *data);
33/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */
34#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT)
35 38
36int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, 39int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
37 xfs_ino_t startino, unsigned int flags, 40 xfs_ino_t startino, unsigned int flags,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7fc3c1ad36bc..a2beee9f74da 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -214,15 +214,42 @@ xlog_grant_head_wake(
214{ 214{
215 struct xlog_ticket *tic; 215 struct xlog_ticket *tic;
216 int need_bytes; 216 int need_bytes;
217 bool woken_task = false;
217 218
218 list_for_each_entry(tic, &head->waiters, t_queue) { 219 list_for_each_entry(tic, &head->waiters, t_queue) {
220
221 /*
222 * There is a chance that the size of the CIL checkpoints in
223 * progress at the last AIL push target calculation resulted in
224 * limiting the target to the log head (l_last_sync_lsn) at the
225 * time. This may not reflect where the log head is now as the
226 * CIL checkpoints may have completed.
227 *
228 * Hence when we are woken here, it may be that the head of the
 229 * log has moved rather than the tail. As the tail didn't
230 * move, there still won't be space available for the
231 * reservation we require. However, if the AIL has already
232 * pushed to the target defined by the old log head location, we
233 * will hang here waiting for something else to update the AIL
234 * push target.
235 *
236 * Therefore, if there isn't space to wake the first waiter on
237 * the grant head, we need to push the AIL again to ensure the
238 * target reflects both the current log tail and log head
239 * position before we wait for the tail to move again.
240 */
241
219 need_bytes = xlog_ticket_reservation(log, head, tic); 242 need_bytes = xlog_ticket_reservation(log, head, tic);
220 if (*free_bytes < need_bytes) 243 if (*free_bytes < need_bytes) {
244 if (!woken_task)
245 xlog_grant_push_ail(log, need_bytes);
221 return false; 246 return false;
247 }
222 248
223 *free_bytes -= need_bytes; 249 *free_bytes -= need_bytes;
224 trace_xfs_log_grant_wake_up(log, tic); 250 trace_xfs_log_grant_wake_up(log, tic);
225 wake_up_process(tic->t_task); 251 wake_up_process(tic->t_task);
252 woken_task = true;
226 } 253 }
227 254
228 return true; 255 return true;
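
The woken_task logic above wakes queued tickets in order while free space lasts and, on the first shortfall, pushes the AIL exactly once so the push target reflects the current log head; if a task was already woken, the wake itself will drive further progress. A compact model of that loop, with push_ail() as a stand-in:

    #include <stdbool.h>
    #include <stdio.h>

    static void push_ail(int need_bytes)
    {
            printf("push AIL for %d bytes\n", need_bytes);
    }

    /*
     * Wake queued waiters in order while space lasts; if a waiter cannot
     * be satisfied and nobody was woken yet, push the AIL so the push
     * target is recomputed from the current log head.
     */
    static bool grant_head_wake(const int *need, int nr, int *free_bytes)
    {
            bool woken_task = false;
            int i;

            for (i = 0; i < nr; i++) {
                    if (*free_bytes < need[i]) {
                            if (!woken_task)
                                    push_ail(need[i]);
                            return false;
                    }
                    *free_bytes -= need[i];
                    printf("wake waiter %d (%d bytes)\n", i, need[i]);
                    woken_task = true;
            }
            return true;
    }

    int main(void)
    {
            int need[] = { 100, 200, 400 };
            int free_bytes = 50;

            /* first waiter cannot be satisfied: push the AIL once */
            grant_head_wake(need, 3, &free_bytes);

            /* one waiter wakes, then we stop without another push */
            free_bytes = 250;
            grant_head_wake(need, 3, &free_bytes);
            return 0;
    }
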
@@ -428,8 +455,7 @@ xfs_log_reserve(
428 XFS_STATS_INC(mp, xs_try_logspace); 455 XFS_STATS_INC(mp, xs_try_logspace);
429 456
430 ASSERT(*ticp == NULL); 457 ASSERT(*ticp == NULL);
431 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 458 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
432 KM_SLEEP);
433 *ticp = tic; 459 *ticp = tic;
434 460
435 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt 461 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -1404,6 +1430,7 @@ xlog_alloc_log(
1404 */ 1430 */
1405 ASSERT(log->l_iclog_size >= 4096); 1431 ASSERT(log->l_iclog_size >= 4096);
1406 for (i = 0; i < log->l_iclog_bufs; i++) { 1432 for (i = 0; i < log->l_iclog_bufs; i++) {
1433 int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
1407 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1434 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
1408 sizeof(struct bio_vec); 1435 sizeof(struct bio_vec);
1409 1436
@@ -1415,8 +1442,8 @@ xlog_alloc_log(
1415 iclog->ic_prev = prev_iclog; 1442 iclog->ic_prev = prev_iclog;
1416 prev_iclog = iclog; 1443 prev_iclog = iclog;
1417 1444
1418 iclog->ic_data = kmem_alloc_large(log->l_iclog_size, 1445 iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
1419 KM_MAYFAIL); 1446 KM_MAYFAIL);
1420 if (!iclog->ic_data) 1447 if (!iclog->ic_data)
1421 goto out_free_iclog; 1448 goto out_free_iclog;
1422#ifdef DEBUG 1449#ifdef DEBUG
@@ -2496,21 +2523,35 @@ next_lv:
2496 ***************************************************************************** 2523 *****************************************************************************
2497 */ 2524 */
2498 2525
2499/* Clean iclogs starting from the head. This ordering must be 2526/*
2500 * maintained, so an iclog doesn't become ACTIVE beyond one that 2527 * An iclog has just finished IO completion processing, so we need to update
2501 * is SYNCING. This is also required to maintain the notion that we use 2528 * the iclog state and propagate that up into the overall log state. Hence we
2502 * a ordered wait queue to hold off would be writers to the log when every 2529 * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
2503 * iclog is trying to sync to disk. 2530 * starting from the head, and then wake up any threads that are waiting for the
2531 * iclog to be marked clean.
2532 *
2533 * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
2534 * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
 2535 * maintain the notion that we use an ordered wait queue to hold off would-be
2536 * writers to the log when every iclog is trying to sync to disk.
2537 *
2538 * Caller must hold the icloglock before calling us.
2504 * 2539 *
2505 * State Change: DIRTY -> ACTIVE 2540 * State Change: !IOERROR -> DIRTY -> ACTIVE
2506 */ 2541 */
2507STATIC void 2542STATIC void
2508xlog_state_clean_log( 2543xlog_state_clean_iclog(
2509 struct xlog *log) 2544 struct xlog *log,
2545 struct xlog_in_core *dirty_iclog)
2510{ 2546{
2511 xlog_in_core_t *iclog; 2547 struct xlog_in_core *iclog;
2512 int changed = 0; 2548 int changed = 0;
2513 2549
2550 /* Prepare the completed iclog. */
2551 if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
2552 dirty_iclog->ic_state = XLOG_STATE_DIRTY;
2553
2554 /* Walk all the iclogs to update the ordered active state. */
2514 iclog = log->l_iclog; 2555 iclog = log->l_iclog;
2515 do { 2556 do {
2516 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2557 if (iclog->ic_state == XLOG_STATE_DIRTY) {
@@ -2548,7 +2589,13 @@ xlog_state_clean_log(
2548 iclog = iclog->ic_next; 2589 iclog = iclog->ic_next;
2549 } while (iclog != log->l_iclog); 2590 } while (iclog != log->l_iclog);
2550 2591
2551 /* log is locked when we are called */ 2592
2593 /*
2594 * Wake up threads waiting in xfs_log_force() for the dirty iclog
2595 * to be cleaned.
2596 */
2597 wake_up_all(&dirty_iclog->ic_force_wait);
2598
2552 /* 2599 /*
2553 * Change state for the dummy log recording. 2600 * Change state for the dummy log recording.
2554 * We usually go to NEED. But we go to NEED2 if the changed indicates 2601 * We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2582,7 +2629,7 @@ xlog_state_clean_log(
2582 ASSERT(0); 2629 ASSERT(0);
2583 } 2630 }
2584 } 2631 }
2585} /* xlog_state_clean_log */ 2632}
2586 2633
2587STATIC xfs_lsn_t 2634STATIC xfs_lsn_t
2588xlog_get_lowest_lsn( 2635xlog_get_lowest_lsn(
@@ -2603,30 +2650,205 @@ xlog_get_lowest_lsn(
2603 return lowest_lsn; 2650 return lowest_lsn;
2604} 2651}
2605 2652
2653/*
 2654 * Completion of an iclog IO does not imply that a transaction has completed, as
2655 * transactions can be large enough to span many iclogs. We cannot change the
2656 * tail of the log half way through a transaction as this may be the only
2657 * transaction in the log and moving the tail to point to the middle of it
2658 * will prevent recovery from finding the start of the transaction. Hence we
2659 * should only update the last_sync_lsn if this iclog contains transaction
2660 * completion callbacks on it.
2661 *
2662 * We have to do this before we drop the icloglock to ensure we are the only one
2663 * that can update it.
2664 *
2665 * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
 2666 * the reservation grant head pushing. This is because the push
2667 * target is bound by the current last_sync_lsn value. Hence if we have a large
2668 * amount of log space bound up in this committing transaction then the
2669 * last_sync_lsn value may be the limiting factor preventing tail pushing from
2670 * freeing space in the log. Hence once we've updated the last_sync_lsn we
2671 * should push the AIL to ensure the push target (and hence the grant head) is
2672 * no longer bound by the old log head location and can move forwards and make
2673 * progress again.
2674 */
2675static void
2676xlog_state_set_callback(
2677 struct xlog *log,
2678 struct xlog_in_core *iclog,
2679 xfs_lsn_t header_lsn)
2680{
2681 iclog->ic_state = XLOG_STATE_CALLBACK;
2682
2683 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2684 header_lsn) <= 0);
2685
2686 if (list_empty_careful(&iclog->ic_callbacks))
2687 return;
2688
2689 atomic64_set(&log->l_last_sync_lsn, header_lsn);
2690 xlog_grant_push_ail(log, 0);
2691}
2692
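The ASSERT above relies on XFS_LSN_CMP(), which orders log sequence numbers as (cycle, block) pairs: an xfs_lsn_t keeps the cycle number in the high 32 bits and the block number in the low 32 bits. A small worked example of that comparison logic; lsn_cmp() is an illustrative reimplementation, not the kernel macro.

    #include <stdint.h>

    typedef int64_t xfs_lsn_t;      /* cycle in bits 63:32, block in 31:0 */

    static int lsn_cmp(xfs_lsn_t a, xfs_lsn_t b)
    {
        uint32_t cycle_a = a >> 32, cycle_b = b >> 32;

        if (cycle_a != cycle_b)
            return cycle_a < cycle_b ? -1 : 1;
        return (uint32_t)a < (uint32_t)b ? -1 :
               (uint32_t)a > (uint32_t)b ?  1 : 0;
    }
    /* e.g. (cycle 7, block 900) sorts before (cycle 8, block 0). */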
2693/*
2694 * Return true if we need to stop processing, false to continue to the next
2695 * iclog. The caller will need to run callbacks if the iclog is returned in the
2696 * XLOG_STATE_CALLBACK state.
2697 */
2698static bool
2699xlog_state_iodone_process_iclog(
2700 struct xlog *log,
2701 struct xlog_in_core *iclog,
2702 struct xlog_in_core *completed_iclog,
2703 bool *ioerror)
2704{
2705 xfs_lsn_t lowest_lsn;
2706 xfs_lsn_t header_lsn;
2707
2708 /* Skip all iclogs in the ACTIVE & DIRTY states */
2709 if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
2710 return false;
2711
2712 /*
2713 * Between marking a filesystem SHUTDOWN and stopping the log, we do
2714 * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
2715 * want things to go smoothly in case of just a SHUTDOWN w/o a
2716 * LOG_IO_ERROR.
2717 */
2718 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2719 *ioerror = true;
2720 return false;
2721 }
2722
2723 /*
2724 * Can only perform callbacks in order. Since this iclog is not in the
 2725 * DONE_SYNC/DO_CALLBACK state, we skip the rest and just try to clean
2726 * up. If we set our iclog to DO_CALLBACK, we will not process it when
2727 * we retry since a previous iclog is in the CALLBACK and the state
2728 * cannot change since we are holding the l_icloglock.
2729 */
2730 if (!(iclog->ic_state &
2731 (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
2732 if (completed_iclog &&
2733 (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
2734 completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
2735 }
2736 return true;
2737 }
2738
2739 /*
2740 * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
 2741 * states. The other states (WANT_SYNC, SYNCING, or CALLBACK) were caught
 2742 * by the check above and are going to be cleaned up (i.e. we aren't doing
 2743 * their callbacks).
2744 *
2745 * We will do one more check here to see if we have chased our tail
2746 * around. If this is not the lowest lsn iclog, then we will leave it
2747 * for another completion to process.
2748 */
2749 header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2750 lowest_lsn = xlog_get_lowest_lsn(log);
2751 if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
2752 return false;
2753
2754 xlog_state_set_callback(log, iclog, header_lsn);
2755 return false;
2756
2757}
2758
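The deferral dance above (DONE_SYNC becomes DO_CALLBACK when an earlier iclog is still syncing, plus the lowest-LSN check) exists because IO completions can arrive out of order while callbacks must run in LSN order. Stripped of the state machine, the underlying idea is a sequence gate: each completion records itself, and only the contiguous in-order prefix is processed. A single-threaded sketch, locking omitted, assuming at most NSLOTS IOs in flight:

    #include <stdbool.h>

    #define NSLOTS 8                /* assumption: max IOs in flight */

    static bool done[NSLOTS];
    static unsigned int next_seq;   /* lowest sequence not yet processed */

    /* Called on each IO completion; seq values may arrive out of order. */
    static void io_completed(unsigned int seq, void (*process)(unsigned int))
    {
        done[seq % NSLOTS] = true;
        /* Drain only the contiguous in-order prefix. */
        while (done[next_seq % NSLOTS]) {
            done[next_seq % NSLOTS] = false;
            process(next_seq);
            next_seq++;
        }
    }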
2759/*
2760 * Keep processing entries in the iclog callback list until we come around and
2761 * it is empty. We need to atomically see that the list is empty and change the
2762 * state to DIRTY so that we don't miss any more callbacks being added.
2763 *
2764 * This function is called with the icloglock held and returns with it held. We
2765 * drop it while running callbacks, however, as holding it over thousands of
 2766 * callbacks is unnecessary and causes excessive contention.
2767 */
2768static void
2769xlog_state_do_iclog_callbacks(
2770 struct xlog *log,
2771 struct xlog_in_core *iclog,
2772 bool aborted)
2773{
2774 spin_unlock(&log->l_icloglock);
2775 spin_lock(&iclog->ic_callback_lock);
2776 while (!list_empty(&iclog->ic_callbacks)) {
2777 LIST_HEAD(tmp);
2778
2779 list_splice_init(&iclog->ic_callbacks, &tmp);
2780
2781 spin_unlock(&iclog->ic_callback_lock);
2782 xlog_cil_process_committed(&tmp, aborted);
2783 spin_lock(&iclog->ic_callback_lock);
2784 }
2785
2786 /*
2787 * Pick up the icloglock while still holding the callback lock so we
2788 * serialise against anyone trying to add more callbacks to this iclog
2789 * now we've finished processing.
2790 */
2791 spin_lock(&log->l_icloglock);
2792 spin_unlock(&iclog->ic_callback_lock);
2793}
2794
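The function above is a textbook splice-and-drop pattern: take the whole pending list while holding the lock, run the callbacks without it, and loop because new entries may have been added in the meantime. A self-contained pthread rendering of the same shape, with hypothetical cb/pending names:

    #include <pthread.h>
    #include <stddef.h>

    struct cb {
        struct cb *next;
        void (*fn)(struct cb *);
    };

    static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct cb *pending;      /* singly linked list of callbacks */

    static void run_callbacks(void)
    {
        pthread_mutex_lock(&cb_lock);
        while (pending) {
            struct cb *batch = pending;     /* splice the whole list out */
            pending = NULL;

            pthread_mutex_unlock(&cb_lock);
            while (batch) {                 /* run without the lock held */
                struct cb *next = batch->next;
                batch->fn(batch);
                batch = next;
            }
            pthread_mutex_lock(&cb_lock);   /* recheck for new arrivals */
        }
        pthread_mutex_unlock(&cb_lock);
    }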
2795#ifdef DEBUG
2796/*
2797 * Make one last gasp attempt to see if iclogs are being left in limbo. If the
2798 * above loop finds an iclog earlier than the current iclog and in one of the
2799 * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
2800 * are deferred to the completion of the earlier iclog. Walk the iclogs in order
2801 * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
2802 * one of the syncing states.
2803 *
2804 * Note that SYNCING|IOERROR is a valid state so we cannot just check for
2805 * ic_state == SYNCING.
2806 */
2807static void
2808xlog_state_callback_check_state(
2809 struct xlog *log)
2810{
2811 struct xlog_in_core *first_iclog = log->l_iclog;
2812 struct xlog_in_core *iclog = first_iclog;
2813
2814 do {
2815 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2816 /*
2817 * Terminate the loop if iclogs are found in states
2818 * which will cause other threads to clean up iclogs.
2819 *
2820 * SYNCING - i/o completion will go through logs
2821 * DONE_SYNC - interrupt thread should be waiting for
2822 * l_icloglock
2823 * IOERROR - give up hope all ye who enter here
2824 */
2825 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2826 iclog->ic_state & XLOG_STATE_SYNCING ||
2827 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2828 iclog->ic_state == XLOG_STATE_IOERROR )
2829 break;
2830 iclog = iclog->ic_next;
2831 } while (first_iclog != iclog);
2832}
2833#else
2834#define xlog_state_callback_check_state(l) ((void)0)
2835#endif
2836
2606STATIC void 2837STATIC void
2607xlog_state_do_callback( 2838xlog_state_do_callback(
2608 struct xlog *log, 2839 struct xlog *log,
2609 bool aborted, 2840 bool aborted,
2610 struct xlog_in_core *ciclog) 2841 struct xlog_in_core *ciclog)
2611{ 2842{
2612 xlog_in_core_t *iclog; 2843 struct xlog_in_core *iclog;
2613 xlog_in_core_t *first_iclog; /* used to know when we've 2844 struct xlog_in_core *first_iclog;
2614 * processed all iclogs once */ 2845 bool did_callbacks = false;
2615 int flushcnt = 0; 2846 bool cycled_icloglock;
2616 xfs_lsn_t lowest_lsn; 2847 bool ioerror;
2617 int ioerrors; /* counter: iclogs with errors */ 2848 int flushcnt = 0;
2618 int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2849 int repeats = 0;
2619 int funcdidcallbacks; /* flag: function did callbacks */
2620 int repeats; /* for issuing console warnings if
2621 * looping too many times */
2622 int wake = 0;
2623 2850
2624 spin_lock(&log->l_icloglock); 2851 spin_lock(&log->l_icloglock);
2625 first_iclog = iclog = log->l_iclog;
2626 ioerrors = 0;
2627 funcdidcallbacks = 0;
2628 repeats = 0;
2629
2630 do { 2852 do {
2631 /* 2853 /*
2632 * Scan all iclogs starting with the one pointed to by the 2854 * Scan all iclogs starting with the one pointed to by the
@@ -2638,137 +2860,34 @@ xlog_state_do_callback(
2638 */ 2860 */
2639 first_iclog = log->l_iclog; 2861 first_iclog = log->l_iclog;
2640 iclog = log->l_iclog; 2862 iclog = log->l_iclog;
2641 loopdidcallbacks = 0; 2863 cycled_icloglock = false;
2864 ioerror = false;
2642 repeats++; 2865 repeats++;
2643 2866
2644 do { 2867 do {
2868 if (xlog_state_iodone_process_iclog(log, iclog,
2869 ciclog, &ioerror))
2870 break;
2645 2871
2646 /* skip all iclogs in the ACTIVE & DIRTY states */ 2872 if (!(iclog->ic_state &
2647 if (iclog->ic_state & 2873 (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
2648 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2649 iclog = iclog->ic_next; 2874 iclog = iclog->ic_next;
2650 continue; 2875 continue;
2651 } 2876 }
2652 2877
2653 /* 2878 /*
2654 * Between marking a filesystem SHUTDOWN and stopping 2879 * Running callbacks will drop the icloglock which means
2655 * the log, we do flush all iclogs to disk (if there 2880 * we'll have to run at least one more complete loop.
2656 * wasn't a log I/O error). So, we do want things to
2657 * go smoothly in case of just a SHUTDOWN w/o a
2658 * LOG_IO_ERROR.
2659 */
2660 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2661 /*
2662 * Can only perform callbacks in order. Since
2663 * this iclog is not in the DONE_SYNC/
2664 * DO_CALLBACK state, we skip the rest and
2665 * just try to clean up. If we set our iclog
2666 * to DO_CALLBACK, we will not process it when
2667 * we retry since a previous iclog is in the
2668 * CALLBACK and the state cannot change since
2669 * we are holding the l_icloglock.
2670 */
2671 if (!(iclog->ic_state &
2672 (XLOG_STATE_DONE_SYNC |
2673 XLOG_STATE_DO_CALLBACK))) {
2674 if (ciclog && (ciclog->ic_state ==
2675 XLOG_STATE_DONE_SYNC)) {
2676 ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2677 }
2678 break;
2679 }
2680 /*
2681 * We now have an iclog that is in either the
2682 * DO_CALLBACK or DONE_SYNC states. The other
2683 * states (WANT_SYNC, SYNCING, or CALLBACK were
2684 * caught by the above if and are going to
2685 * clean (i.e. we aren't doing their callbacks)
2686 * see the above if.
2687 */
2688
2689 /*
2690 * We will do one more check here to see if we
2691 * have chased our tail around.
2692 */
2693
2694 lowest_lsn = xlog_get_lowest_lsn(log);
2695 if (lowest_lsn &&
2696 XFS_LSN_CMP(lowest_lsn,
2697 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2698 iclog = iclog->ic_next;
2699 continue; /* Leave this iclog for
2700 * another thread */
2701 }
2702
2703 iclog->ic_state = XLOG_STATE_CALLBACK;
2704
2705
2706 /*
2707 * Completion of a iclog IO does not imply that
2708 * a transaction has completed, as transactions
2709 * can be large enough to span many iclogs. We
2710 * cannot change the tail of the log half way
2711 * through a transaction as this may be the only
2712 * transaction in the log and moving th etail to
2713 * point to the middle of it will prevent
2714 * recovery from finding the start of the
2715 * transaction. Hence we should only update the
2716 * last_sync_lsn if this iclog contains
2717 * transaction completion callbacks on it.
2718 *
2719 * We have to do this before we drop the
2720 * icloglock to ensure we are the only one that
2721 * can update it.
2722 */
2723 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2724 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2725 if (!list_empty_careful(&iclog->ic_callbacks))
2726 atomic64_set(&log->l_last_sync_lsn,
2727 be64_to_cpu(iclog->ic_header.h_lsn));
2728
2729 } else
2730 ioerrors++;
2731
2732 spin_unlock(&log->l_icloglock);
2733
2734 /*
2735 * Keep processing entries in the callback list until
2736 * we come around and it is empty. We need to
2737 * atomically see that the list is empty and change the
2738 * state to DIRTY so that we don't miss any more
2739 * callbacks being added.
2740 */
2741 spin_lock(&iclog->ic_callback_lock);
2742 while (!list_empty(&iclog->ic_callbacks)) {
2743 LIST_HEAD(tmp);
2744
2745 list_splice_init(&iclog->ic_callbacks, &tmp);
2746
2747 spin_unlock(&iclog->ic_callback_lock);
2748 xlog_cil_process_committed(&tmp, aborted);
2749 spin_lock(&iclog->ic_callback_lock);
2750 }
2751
2752 loopdidcallbacks++;
2753 funcdidcallbacks++;
2754
2755 spin_lock(&log->l_icloglock);
2756 spin_unlock(&iclog->ic_callback_lock);
2757 if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2758 iclog->ic_state = XLOG_STATE_DIRTY;
2759
2760 /*
2761 * Transition from DIRTY to ACTIVE if applicable.
2762 * NOP if STATE_IOERROR.
2763 */ 2881 */
2764 xlog_state_clean_log(log); 2882 cycled_icloglock = true;
2765 2883 xlog_state_do_iclog_callbacks(log, iclog, aborted);
2766 /* wake up threads waiting in xfs_log_force() */
2767 wake_up_all(&iclog->ic_force_wait);
2768 2884
2885 xlog_state_clean_iclog(log, iclog);
2769 iclog = iclog->ic_next; 2886 iclog = iclog->ic_next;
2770 } while (first_iclog != iclog); 2887 } while (first_iclog != iclog);
2771 2888
2889 did_callbacks |= cycled_icloglock;
2890
2772 if (repeats > 5000) { 2891 if (repeats > 5000) {
2773 flushcnt += repeats; 2892 flushcnt += repeats;
2774 repeats = 0; 2893 repeats = 0;
@@ -2776,50 +2895,15 @@ xlog_state_do_callback(
2776 "%s: possible infinite loop (%d iterations)", 2895 "%s: possible infinite loop (%d iterations)",
2777 __func__, flushcnt); 2896 __func__, flushcnt);
2778 } 2897 }
2779 } while (!ioerrors && loopdidcallbacks); 2898 } while (!ioerror && cycled_icloglock);
2780 2899
2781#ifdef DEBUG 2900 if (did_callbacks)
2782 /* 2901 xlog_state_callback_check_state(log);
2783 * Make one last gasp attempt to see if iclogs are being left in limbo.
2784 * If the above loop finds an iclog earlier than the current iclog and
2785 * in one of the syncing states, the current iclog is put into
2786 * DO_CALLBACK and the callbacks are deferred to the completion of the
2787 * earlier iclog. Walk the iclogs in order and make sure that no iclog
2788 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
2789 * states.
2790 *
2791 * Note that SYNCING|IOABORT is a valid state so we cannot just check
2792 * for ic_state == SYNCING.
2793 */
2794 if (funcdidcallbacks) {
2795 first_iclog = iclog = log->l_iclog;
2796 do {
2797 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2798 /*
2799 * Terminate the loop if iclogs are found in states
2800 * which will cause other threads to clean up iclogs.
2801 *
2802 * SYNCING - i/o completion will go through logs
2803 * DONE_SYNC - interrupt thread should be waiting for
2804 * l_icloglock
2805 * IOERROR - give up hope all ye who enter here
2806 */
2807 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2808 iclog->ic_state & XLOG_STATE_SYNCING ||
2809 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2810 iclog->ic_state == XLOG_STATE_IOERROR )
2811 break;
2812 iclog = iclog->ic_next;
2813 } while (first_iclog != iclog);
2814 }
2815#endif
2816 2902
2817 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2903 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
2818 wake = 1;
2819 spin_unlock(&log->l_icloglock);
2820
2821 if (wake)
2822 wake_up_all(&log->l_flush_wait); 2904 wake_up_all(&log->l_flush_wait);
2905
2906 spin_unlock(&log->l_icloglock);
2823} 2907}
2824 2908
2825 2909
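The restructured loop above keeps one invariant from the old code: any pass that dropped the icloglock to run callbacks must be followed by another complete pass, because the iclog states may have changed while the lock was released, and the function only exits after a clean pass. The same rescan-until-stable shape in miniature, with a pthread mutex standing in for the spinlock:

    #include <pthread.h>
    #include <stdbool.h>

    #define N 4

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static bool needs_work[N];

    static void do_work(int i) { (void)i; /* may sleep, take other locks */ }

    static void process_all(void)
    {
        bool dropped;

        pthread_mutex_lock(&lock);
        do {
            dropped = false;
            for (int i = 0; i < N; i++) {
                if (!needs_work[i])
                    continue;
                needs_work[i] = false;      /* claim it under the lock */
                pthread_mutex_unlock(&lock);
                do_work(i);
                pthread_mutex_lock(&lock);
                dropped = true;             /* state may have changed */
            }
        } while (dropped);                  /* exit only after a clean pass */
        pthread_mutex_unlock(&lock);
    }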
@@ -3919,7 +4003,9 @@ xfs_log_force_umount(
3919 * item committed callback functions will do this again under lock to 4003 * item committed callback functions will do this again under lock to
3920 * avoid races. 4004 * avoid races.
3921 */ 4005 */
4006 spin_lock(&log->l_cilp->xc_push_lock);
3922 wake_up_all(&log->l_cilp->xc_commit_wait); 4007 wake_up_all(&log->l_cilp->xc_commit_wait);
4008 spin_unlock(&log->l_cilp->xc_push_lock);
3923 xlog_state_do_callback(log, true, NULL); 4009 xlog_state_do_callback(log, true, NULL);
3924 4010
3925#ifdef XFSERRORDEBUG 4011#ifdef XFSERRORDEBUG
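The two added lines around wake_up_all() close a lost-wakeup window: a committer checks its wait condition under xc_push_lock and then sleeps, so a wakeup issued without that lock can slip between the check and the sleep and be missed forever. The classic shape of the race and its fix, in condition-variable terms:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static bool committed;

    static void waiter(void)
    {
        pthread_mutex_lock(&m);
        while (!committed)          /* check and sleep under one lock */
            pthread_cond_wait(&cv, &m);
        pthread_mutex_unlock(&m);
    }

    static void waker(void)
    {
        /*
         * Without taking the lock here, the broadcast could land between
         * the waiter's check and its sleep, and be lost forever.
         */
        pthread_mutex_lock(&m);
        committed = true;
        pthread_cond_broadcast(&cv);
        pthread_mutex_unlock(&m);
    }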
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index fa5602d0fd7f..ef652abd112c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -38,7 +38,7 @@ xlog_cil_ticket_alloc(
38 struct xlog_ticket *tic; 38 struct xlog_ticket *tic;
39 39
40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, 40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
41 KM_SLEEP|KM_NOFS); 41 KM_NOFS);
42 42
43 /* 43 /*
44 * set the current reservation to zero so we know to steal the basic 44 * set the current reservation to zero so we know to steal the basic
@@ -186,7 +186,7 @@ xlog_cil_alloc_shadow_bufs(
186 */ 186 */
187 kmem_free(lip->li_lv_shadow); 187 kmem_free(lip->li_lv_shadow);
188 188
189 lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); 189 lv = kmem_alloc_large(buf_size, KM_NOFS);
190 memset(lv, 0, xlog_cil_iovec_space(niovecs)); 190 memset(lv, 0, xlog_cil_iovec_space(niovecs));
191 191
192 lv->lv_item = lip; 192 lv->lv_item = lip;
@@ -660,7 +660,7 @@ xlog_cil_push(
660 if (!cil) 660 if (!cil)
661 return 0; 661 return 0;
662 662
663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
664 new_ctx->ticket = xlog_cil_ticket_alloc(log); 664 new_ctx->ticket = xlog_cil_ticket_alloc(log);
665 665
666 down_write(&cil->xc_ctx_lock); 666 down_write(&cil->xc_ctx_lock);
@@ -1179,11 +1179,11 @@ xlog_cil_init(
1179 struct xfs_cil *cil; 1179 struct xfs_cil *cil;
1180 struct xfs_cil_ctx *ctx; 1180 struct xfs_cil_ctx *ctx;
1181 1181
1182 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 1182 cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
1183 if (!cil) 1183 if (!cil)
1184 return -ENOMEM; 1184 return -ENOMEM;
1185 1185
1186 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 1186 ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
1187 if (!ctx) { 1187 if (!ctx) {
1188 kmem_free(cil); 1188 kmem_free(cil);
1189 return -ENOMEM; 1189 return -ENOMEM;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13d1d3e95b88..508319039dce 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -97,6 +97,8 @@ xlog_alloc_buffer(
97 struct xlog *log, 97 struct xlog *log,
98 int nbblks) 98 int nbblks)
99{ 99{
100 int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
101
100 /* 102 /*
101 * Pass log block 0 since we don't have an addr yet, buffer will be 103 * Pass log block 0 since we don't have an addr yet, buffer will be
102 * verified on read. 104 * verified on read.
@@ -125,7 +127,7 @@ xlog_alloc_buffer(
125 if (nbblks > 1 && log->l_sectBBsize > 1) 127 if (nbblks > 1 && log->l_sectBBsize > 1)
126 nbblks += log->l_sectBBsize; 128 nbblks += log->l_sectBBsize;
127 nbblks = round_up(nbblks, log->l_sectBBsize); 129 nbblks = round_up(nbblks, log->l_sectBBsize);
128 return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); 130 return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
129} 131}
130 132
131/* 133/*
@@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1(
1960 } 1962 }
1961 } 1963 }
1962 1964
1963 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1965 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
1964 bcp->bc_blkno = buf_f->blf_blkno; 1966 bcp->bc_blkno = buf_f->blf_blkno;
1965 bcp->bc_len = buf_f->blf_len; 1967 bcp->bc_len = buf_f->blf_len;
1966 bcp->bc_refcount = 1; 1968 bcp->bc_refcount = 1;
@@ -2930,7 +2932,7 @@ xlog_recover_inode_pass2(
2930 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 2932 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
2931 in_f = item->ri_buf[0].i_addr; 2933 in_f = item->ri_buf[0].i_addr;
2932 } else { 2934 } else {
2933 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); 2935 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
2934 need_free = 1; 2936 need_free = 1;
2935 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2937 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2936 if (error) 2938 if (error)
@@ -4161,7 +4163,7 @@ xlog_recover_add_item(
4161{ 4163{
4162 xlog_recover_item_t *item; 4164 xlog_recover_item_t *item;
4163 4165
4164 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 4166 item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
4165 INIT_LIST_HEAD(&item->ri_list); 4167 INIT_LIST_HEAD(&item->ri_list);
4166 list_add_tail(&item->ri_list, head); 4168 list_add_tail(&item->ri_list, head);
4167} 4169}
@@ -4201,7 +4203,7 @@ xlog_recover_add_to_cont_trans(
4201 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 4203 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
4202 old_len = item->ri_buf[item->ri_cnt-1].i_len; 4204 old_len = item->ri_buf[item->ri_cnt-1].i_len;
4203 4205
4204 ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); 4206 ptr = kmem_realloc(old_ptr, len + old_len, 0);
4205 memcpy(&ptr[old_len], dp, len); 4207 memcpy(&ptr[old_len], dp, len);
4206 item->ri_buf[item->ri_cnt-1].i_len += len; 4208 item->ri_buf[item->ri_cnt-1].i_len += len;
4207 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 4209 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4261,7 +4263,7 @@ xlog_recover_add_to_trans(
4261 return 0; 4263 return 0;
4262 } 4264 }
4263 4265
4264 ptr = kmem_alloc(len, KM_SLEEP); 4266 ptr = kmem_alloc(len, 0);
4265 memcpy(ptr, dp, len); 4267 memcpy(ptr, dp, len);
4266 in_f = (struct xfs_inode_log_format *)ptr; 4268 in_f = (struct xfs_inode_log_format *)ptr;
4267 4269
@@ -4289,7 +4291,7 @@ xlog_recover_add_to_trans(
4289 item->ri_total = in_f->ilf_size; 4291 item->ri_total = in_f->ilf_size;
4290 item->ri_buf = 4292 item->ri_buf =
4291 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 4293 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4292 KM_SLEEP); 4294 0);
4293 } 4295 }
4294 ASSERT(item->ri_total > item->ri_cnt); 4296 ASSERT(item->ri_total > item->ri_cnt);
4295 /* Description region is ri_buf[0] */ 4297 /* Description region is ri_buf[0] */
@@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans(
4423 * This is a new transaction so allocate a new recovery container to 4425 * This is a new transaction so allocate a new recovery container to
4424 * hold the recovery ops that will follow. 4426 * hold the recovery ops that will follow.
4425 */ 4427 */
4426 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 4428 trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
4427 trans->r_log_tid = tid; 4429 trans->r_log_tid = tid;
4428 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 4430 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4429 INIT_LIST_HEAD(&trans->r_itemq); 4431 INIT_LIST_HEAD(&trans->r_itemq);
@@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink(
5022} 5024}
5023 5025
5024/* 5026/*
5025 * xlog_iunlink_recover 5027 * Recover AGI unlinked lists
5028 *
5029 * This is called during recovery to process any inodes which we unlinked but
5030 * not freed when the system crashed. These inodes will be on the lists in the
5031 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
5032 * any inodes found on the lists. Each inode is removed from the lists when it
5033 * has been fully truncated and is freed. The freeing of the inode and its
5034 * removal from the list must be atomic.
5035 *
5036 * If everything we touch in the agi processing loop is already in memory, this
 5037 * loop can hold the CPU for a long time. It runs without lock contention,
 5038 * memory allocation contention, the need to wait for IO, etc., and so will
 5039 * run until we either run out of inodes to process, run low on memory, or
 5040 * run out of log space.
5026 * 5041 *
5027 * This is called during recovery to process any inodes which 5042 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
 5028 * we unlinked but not freed when the system crashed. These 5043 * and can prevent other filesystem work (such as CIL pushes) from running. This
5029 * inodes will be on the lists in the AGI blocks. What we do 5044 * can lead to deadlocks if the recovery process runs out of log reservation
5030 * here is scan all the AGIs and fully truncate and free any 5045 * space. Hence we need to yield the CPU when there is other kernel work
5031 * inodes found on the lists. Each inode is removed from the 5046 * scheduled on this CPU to ensure other scheduled work can run without undue
5032 * lists when it has been fully truncated and is freed. The 5047 * latency.
5033 * freeing of the inode and its removal from the list must be
5034 * atomic.
5035 */ 5048 */
5036STATIC void 5049STATIC void
5037xlog_recover_process_iunlinks( 5050xlog_recover_process_iunlinks(
@@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks(
5078 while (agino != NULLAGINO) { 5091 while (agino != NULLAGINO) {
5079 agino = xlog_recover_process_one_iunlink(mp, 5092 agino = xlog_recover_process_one_iunlink(mp,
5080 agno, agino, bucket); 5093 agno, agino, bucket);
5094 cond_resched();
5081 } 5095 }
5082 } 5096 }
5083 xfs_buf_rele(agibp); 5097 xfs_buf_rele(agibp);
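The single added cond_resched() is the entire latency fix described in the rewritten comment: the iunlink walk stays tight, but every iteration now offers the scheduler a preemption point. A rough user-space analogue of the same idea, with sched_yield() standing in for the kernel primitive and the batching factor chosen arbitrarily:

    #include <sched.h>

    /* Process a long chain of fully cached items without hogging the CPU. */
    static void process_chain(long nitems)
    {
        for (long i = 0; i < nitems; i++) {
            /* ... no IO, no lock contention, no allocation stalls ... */
            if ((i & 1023) == 0)
                sched_yield();      /* let other runnable work in */
        }
    }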
@@ -5527,7 +5541,7 @@ xlog_do_log_recovery(
5527 */ 5541 */
5528 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 5542 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
5529 sizeof(struct list_head), 5543 sizeof(struct list_head),
5530 KM_SLEEP); 5544 0);
5531 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 5545 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5532 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 5546 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
5533 5547
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 322da6909290..ba5b6f3b2b88 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -82,7 +82,7 @@ xfs_uuid_mount(
82 if (hole < 0) { 82 if (hole < 0) {
83 xfs_uuid_table = kmem_realloc(xfs_uuid_table, 83 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
85 KM_SLEEP); 85 0);
86 hole = xfs_uuid_table_size++; 86 hole = xfs_uuid_table_size++;
87 } 87 }
88 xfs_uuid_table[hole] = *uuid; 88 xfs_uuid_table[hole] = *uuid;
@@ -214,7 +214,7 @@ xfs_initialize_perag(
214 214
215 spin_lock(&mp->m_perag_lock); 215 spin_lock(&mp->m_perag_lock);
216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
217 BUG(); 217 WARN_ON_ONCE(1);
218 spin_unlock(&mp->m_perag_lock); 218 spin_unlock(&mp->m_perag_lock);
219 radix_tree_preload_end(); 219 radix_tree_preload_end();
220 error = -EEXIST; 220 error = -EEXIST;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4adb6837439a..fdb60e09a9c5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,13 +327,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327} 327}
328 328
329/* per-AG block reservation data structures*/ 329/* per-AG block reservation data structures*/
330enum xfs_ag_resv_type {
331 XFS_AG_RESV_NONE = 0,
332 XFS_AG_RESV_AGFL,
333 XFS_AG_RESV_METADATA,
334 XFS_AG_RESV_RMAPBT,
335};
336
337struct xfs_ag_resv { 330struct xfs_ag_resv {
338 /* number of blocks originally reserved here */ 331 /* number of blocks originally reserved here */
339 xfs_extlen_t ar_orig_reserved; 332 xfs_extlen_t ar_orig_reserved;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 74738813f60d..a06661dac5be 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,12 +333,12 @@ xfs_mru_cache_create(
333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) 333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
334 return -EINVAL; 334 return -EINVAL;
335 335
336 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) 336 if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
337 return -ENOMEM; 337 return -ENOMEM;
338 338
339 /* An extra list is needed to avoid reaping up to a grp_time early. */ 339 /* An extra list is needed to avoid reaping up to a grp_time early. */
340 mru->grp_count = grp_count + 1; 340 mru->grp_count = grp_count + 1;
341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); 341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
342 342
343 if (!mru->lists) { 343 if (!mru->lists) {
344 err = -ENOMEM; 344 err = -ENOMEM;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5e7a37f0cf84..ecd8ce152ab1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -642,7 +642,7 @@ xfs_qm_init_quotainfo(
642 642
643 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 643 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
644 644
645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
646 646
647 error = list_lru_init(&qinf->qi_lru); 647 error = list_lru_init(&qinf->qi_lru);
648 if (error) 648 if (error)
@@ -978,7 +978,7 @@ xfs_qm_reset_dqcounts_buf(
978 if (qip->i_d.di_nblocks == 0) 978 if (qip->i_d.di_nblocks == 0)
979 return 0; 979 return 0;
980 980
981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
982 982
983 lblkno = 0; 983 lblkno = 0;
984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d8288aa0670a..2328268e6245 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -144,9 +144,9 @@ xfs_cui_init(
144 ASSERT(nextents > 0); 144 ASSERT(nextents > 0);
145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS) 145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
147 KM_SLEEP); 147 0);
148 else 148 else
149 cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); 149 cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
150 150
151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); 151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
152 cuip->cui_format.cui_nextents = nextents; 152 cuip->cui_format.cui_nextents = nextents;
@@ -223,7 +223,7 @@ xfs_trans_get_cud(
223{ 223{
224 struct xfs_cud_log_item *cudp; 224 struct xfs_cud_log_item *cudp;
225 225
226 cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); 226 cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, 227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
228 &xfs_cud_item_ops); 228 &xfs_cud_item_ops);
229 cudp->cud_cuip = cuip; 229 cudp->cud_cuip = cuip;
@@ -555,26 +555,24 @@ xfs_cui_recover(
555 irec.br_blockcount = new_len; 555 irec.br_blockcount = new_len;
556 switch (type) { 556 switch (type) {
557 case XFS_REFCOUNT_INCREASE: 557 case XFS_REFCOUNT_INCREASE:
558 error = xfs_refcount_increase_extent(tp, &irec); 558 xfs_refcount_increase_extent(tp, &irec);
559 break; 559 break;
560 case XFS_REFCOUNT_DECREASE: 560 case XFS_REFCOUNT_DECREASE:
561 error = xfs_refcount_decrease_extent(tp, &irec); 561 xfs_refcount_decrease_extent(tp, &irec);
562 break; 562 break;
563 case XFS_REFCOUNT_ALLOC_COW: 563 case XFS_REFCOUNT_ALLOC_COW:
564 error = xfs_refcount_alloc_cow_extent(tp, 564 xfs_refcount_alloc_cow_extent(tp,
565 irec.br_startblock, 565 irec.br_startblock,
566 irec.br_blockcount); 566 irec.br_blockcount);
567 break; 567 break;
568 case XFS_REFCOUNT_FREE_COW: 568 case XFS_REFCOUNT_FREE_COW:
569 error = xfs_refcount_free_cow_extent(tp, 569 xfs_refcount_free_cow_extent(tp,
570 irec.br_startblock, 570 irec.br_startblock,
571 irec.br_blockcount); 571 irec.br_blockcount);
572 break; 572 break;
573 default: 573 default:
574 ASSERT(0); 574 ASSERT(0);
575 } 575 }
576 if (error)
577 goto abort_error;
578 requeue_only = true; 576 requeue_only = true;
579 } 577 }
580 } 578 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index edbe37b7f636..0f08153b4994 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -495,10 +495,8 @@ xfs_reflink_cancel_cow_blocks(
495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); 495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
496 496
497 /* Free the CoW orphan record. */ 497 /* Free the CoW orphan record. */
498 error = xfs_refcount_free_cow_extent(*tpp, 498 xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
499 del.br_startblock, del.br_blockcount); 499 del.br_blockcount);
500 if (error)
501 break;
502 500
503 xfs_bmap_add_free(*tpp, del.br_startblock, 501 xfs_bmap_add_free(*tpp, del.br_startblock,
504 del.br_blockcount, NULL); 502 del.br_blockcount, NULL);
@@ -675,15 +673,10 @@ xfs_reflink_end_cow_extent(
675 trace_xfs_reflink_cow_remap(ip, &del); 673 trace_xfs_reflink_cow_remap(ip, &del);
676 674
677 /* Free the CoW orphan record. */ 675 /* Free the CoW orphan record. */
678 error = xfs_refcount_free_cow_extent(tp, del.br_startblock, 676 xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
679 del.br_blockcount);
680 if (error)
681 goto out_cancel;
682 677
683 /* Map the new blocks into the data fork. */ 678 /* Map the new blocks into the data fork. */
684 error = xfs_bmap_map_extent(tp, ip, &del); 679 xfs_bmap_map_extent(tp, ip, &del);
685 if (error)
686 goto out_cancel;
687 680
688 /* Charge this new data fork mapping to the on-disk quota. */ 681 /* Charge this new data fork mapping to the on-disk quota. */
689 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, 682 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1070,14 +1063,10 @@ xfs_reflink_remap_extent(
1070 uirec.br_blockcount, uirec.br_startblock); 1063 uirec.br_blockcount, uirec.br_startblock);
1071 1064
1072 /* Update the refcount tree */ 1065 /* Update the refcount tree */
1073 error = xfs_refcount_increase_extent(tp, &uirec); 1066 xfs_refcount_increase_extent(tp, &uirec);
1074 if (error)
1075 goto out_cancel;
1076 1067
1077 /* Map the new blocks into the data fork. */ 1068 /* Map the new blocks into the data fork. */
1078 error = xfs_bmap_map_extent(tp, ip, &uirec); 1069 xfs_bmap_map_extent(tp, ip, &uirec);
1079 if (error)
1080 goto out_cancel;
1081 1070
1082 /* Update quota accounting. */ 1071 /* Update quota accounting. */
1083 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1072 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 77ed557b6127..8939e0ea09cd 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -142,9 +142,9 @@ xfs_rui_init(
142 142
143 ASSERT(nextents > 0); 143 ASSERT(nextents > 0);
144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); 145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
146 else 146 else
147 ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); 147 ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
148 148
149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
150 ruip->rui_format.rui_nextents = nextents; 150 ruip->rui_format.rui_nextents = nextents;
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
244{ 244{
245 struct xfs_rud_log_item *rudp; 245 struct xfs_rud_log_item *rudp;
246 246
247 rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); 247 rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
249 &xfs_rud_item_ops); 249 &xfs_rud_item_ops);
250 rudp->rud_ruip = ruip; 250 rudp->rud_ruip = ruip;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5fa4db3c3e32..4a48a8c75b4f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -865,7 +865,7 @@ xfs_alloc_rsum_cache(
865 * lower bound on the minimum level with any free extents. We can 865 * lower bound on the minimum level with any free extents. We can
866 * continue without the cache if it couldn't be allocated. 866 * continue without the cache if it couldn't be allocated.
867 */ 867 */
868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); 868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
869 if (!mp->m_rsum_cache) 869 if (!mp->m_rsum_cache)
870 xfs_warn(mp, "could not allocate realtime summary cache"); 870 xfs_warn(mp, "could not allocate realtime summary cache");
871} 871}
@@ -963,7 +963,7 @@ xfs_growfs_rt(
963 /* 963 /*
964 * Allocate a new (fake) mount/sb. 964 * Allocate a new (fake) mount/sb.
965 */ 965 */
966 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); 966 nmp = kmem_alloc(sizeof(*nmp), 0);
967 /* 967 /*
968 * Loop over the bitmap blocks. 968 * Loop over the bitmap blocks.
969 * We will do everything one bitmap block at a time. 969 * We will do everything one bitmap block at a time.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f9450235533c..391b4748cae3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -818,7 +818,8 @@ xfs_init_mount_workqueues(
818 goto out_destroy_buf; 818 goto out_destroy_buf;
819 819
820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
821 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 821 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
822 0, mp->m_fsname);
822 if (!mp->m_cil_workqueue) 823 if (!mp->m_cil_workqueue)
823 goto out_destroy_unwritten; 824 goto out_destroy_unwritten;
824 825
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8094b1920eef..eaae275ed430 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -23,6 +23,7 @@ struct xlog;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xlog_recover; 24struct xlog_recover;
25struct xlog_recover_item; 25struct xlog_recover_item;
26struct xlog_rec_header;
26struct xfs_buf_log_format; 27struct xfs_buf_log_format;
27struct xfs_inode_log_format; 28struct xfs_inode_log_format;
28struct xfs_bmbt_irec; 29struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@ struct xfs_btree_cur;
30struct xfs_refcount_irec; 31struct xfs_refcount_irec;
31struct xfs_fsmap; 32struct xfs_fsmap;
32struct xfs_rmap_irec; 33struct xfs_rmap_irec;
34struct xfs_icreate_log;
35struct xfs_owner_info;
36struct xfs_trans_res;
37struct xfs_inobt_rec_incore;
33 38
34DECLARE_EVENT_CLASS(xfs_attr_list_class, 39DECLARE_EVENT_CLASS(xfs_attr_list_class,
35 TP_PROTO(struct xfs_attr_list_context *ctx), 40 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3575,6 +3580,35 @@ TRACE_EVENT(xfs_pwork_init,
3575 __entry->nr_threads, __entry->pid) 3580 __entry->nr_threads, __entry->pid)
3576) 3581)
3577 3582
3583DECLARE_EVENT_CLASS(xfs_kmem_class,
3584 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
3585 TP_ARGS(size, flags, caller_ip),
3586 TP_STRUCT__entry(
3587 __field(ssize_t, size)
3588 __field(int, flags)
3589 __field(unsigned long, caller_ip)
3590 ),
3591 TP_fast_assign(
3592 __entry->size = size;
3593 __entry->flags = flags;
3594 __entry->caller_ip = caller_ip;
3595 ),
3596 TP_printk("size %zd flags 0x%x caller %pS",
3597 __entry->size,
3598 __entry->flags,
3599 (char *)__entry->caller_ip)
3600)
3601
3602#define DEFINE_KMEM_EVENT(name) \
3603DEFINE_EVENT(xfs_kmem_class, name, \
3604 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
3605 TP_ARGS(size, flags, caller_ip))
3606DEFINE_KMEM_EVENT(kmem_alloc);
3607DEFINE_KMEM_EVENT(kmem_alloc_io);
3608DEFINE_KMEM_EVENT(kmem_alloc_large);
3609DEFINE_KMEM_EVENT(kmem_realloc);
3610DEFINE_KMEM_EVENT(kmem_zone_alloc);
3611
3578#endif /* _TRACE_XFS_H */ 3612#endif /* _TRACE_XFS_H */
3579 3613
3580#undef TRACE_INCLUDE_PATH 3614#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d42a68d8313b..f4795fdb7389 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -90,7 +90,7 @@ xfs_trans_dup(
90 90
91 trace_xfs_trans_dup(tp, _RET_IP_); 91 trace_xfs_trans_dup(tp, _RET_IP_);
92 92
93 ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 93 ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
94 94
95 /* 95 /*
96 * Initialize the new transaction structure. 96 * Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
263 * GFP_NOFS allocation context so that we avoid lockdep false positives 263 * GFP_NOFS allocation context so that we avoid lockdep false positives
264 * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 264 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
265 */ 265 */
266 tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 266 tp = kmem_zone_zalloc(xfs_trans_zone, 0);
267 if (!(flags & XFS_TRANS_NO_WRITECOUNT)) 267 if (!(flags & XFS_TRANS_NO_WRITECOUNT))
268 sb_start_intwrite(mp->m_super); 268 sb_start_intwrite(mp->m_super);
269 269
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 1027c9ca6eb8..16457465833b 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -863,7 +863,7 @@ STATIC void
863xfs_trans_alloc_dqinfo( 863xfs_trans_alloc_dqinfo(
864 xfs_trans_t *tp) 864 xfs_trans_t *tp)
865{ 865{
866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); 866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
867} 867}
868 868
869void 869void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 3123b5aaad2a..cb895b1df5e4 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -30,7 +30,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
30 value = NULL; 30 value = NULL;
31 } 31 }
32 32
33 error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 33 error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
34 if (error) 34 if (error)
35 return error; 35 return error;
36 return asize; 36 return asize;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae6648145d18..ffe35d97afcb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3543,6 +3543,8 @@ extern void inode_nohighmem(struct inode *inode);
3543/* mm/fadvise.c */ 3543/* mm/fadvise.c */
3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
3545 int advice); 3545 int advice);
3546extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
3547 int advice);
3546 3548
3547#if defined(CONFIG_IO_URING) 3549#if defined(CONFIG_IO_URING)
3548extern struct sock *io_uring_get_socket(struct file *file); 3550extern struct sock *io_uring_get_socket(struct file *file);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd032037..4f17c83db575 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -27,8 +27,7 @@
27 * deactivate the pages and clear PG_Referenced. 27 * deactivate the pages and clear PG_Referenced.
28 */ 28 */
29 29
30static int generic_fadvise(struct file *file, loff_t offset, loff_t len, 30int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
31 int advice)
32{ 31{
33 struct inode *inode; 32 struct inode *inode;
34 struct address_space *mapping; 33 struct address_space *mapping;
@@ -178,6 +177,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
178 } 177 }
179 return 0; 178 return 0;
180} 179}
180EXPORT_SYMBOL(generic_fadvise);
181 181
182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
183{ 183{
diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3aa069f..bac973b9f2cc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -14,6 +14,7 @@
14#include <linux/userfaultfd_k.h> 14#include <linux/userfaultfd_k.h>
15#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/falloc.h> 16#include <linux/falloc.h>
17#include <linux/fadvise.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/ksm.h> 19#include <linux/ksm.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
@@ -275,6 +276,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
275 unsigned long start, unsigned long end) 276 unsigned long start, unsigned long end)
276{ 277{
277 struct file *file = vma->vm_file; 278 struct file *file = vma->vm_file;
279 loff_t offset;
278 280
279 *prev = vma; 281 *prev = vma;
280#ifdef CONFIG_SWAP 282#ifdef CONFIG_SWAP
@@ -298,12 +300,20 @@ static long madvise_willneed(struct vm_area_struct *vma,
298 return 0; 300 return 0;
299 } 301 }
300 302
301 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 303 /*
302 if (end > vma->vm_end) 304 * Filesystem's fadvise may need to take various locks. We need to
303 end = vma->vm_end; 305 * explicitly grab a reference because the vma (and hence the
304 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 306 * vma's reference to the file) can go away as soon as we drop
305 307 * mmap_sem.
306 force_page_cache_readahead(file->f_mapping, file, start, end - start); 308 */
309 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
310 get_file(file);
311 up_read(&current->mm->mmap_sem);
312 offset = (loff_t)(start - vma->vm_start)
313 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
314 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
315 fput(file);
316 down_read(&current->mm->mmap_sem);
307 return 0; 317 return 0;
308} 318}
309 319
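The replacement code computes the byte offset handed to vfs_fadvise() from two pieces: the distance of start into the mapping (start - vma->vm_start) and the file offset at which the mapping begins (vm_pgoff << PAGE_SHIFT). A small worked example of that arithmetic with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12           /* 4 KiB pages */

    int main(void)
    {
        uint64_t vm_start = 0x7f0000400000ULL;  /* mapping base address */
        uint64_t vm_pgoff = 3;                  /* mapped from file page 3 */
        uint64_t start    = 0x7f0000402000ULL;  /* madvise() range start */

        /* 0x2000 into the mapping + 0x3000 mapping base = offset 0x5000 */
        uint64_t offset = (start - vm_start) + (vm_pgoff << PAGE_SHIFT);

        printf("file offset = 0x%llx\n", (unsigned long long)offset);
        return 0;
    }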