aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 21:32:43 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 21:32:43 -0400
commitb41dae061bbd722b9d7fa828f35d22035b218e18 (patch)
treea5c0bade0c3d221483b54204bfc47e4fdbf09316
parente6bc9de714972cac34daa1dc1567ee48a47a9342 (diff)
parent14e15f1bcd738dc13dd7c1e78e4800e8bc577980 (diff)
Merge tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Darrick Wong: "For this cycle we have the usual pile of cleanups and bug fixes, some performance improvements for online metadata scrubbing, massive speedups in the directory entry creation code, some performance improvement in the file ACL lookup code, a fix for a logging stall during mount, and fixes for concurrency problems. It has survived a couple of weeks of xfstests runs and merges cleanly. Summary: - Remove KM_SLEEP/KM_NOSLEEP. - Ensure that memory buffers for IO are properly sector-aligned to avoid problems that the block layer doesn't check. - Make the bmap scrubber more efficient in its record checking. - Don't crash xfs_db when superblock inode geometry is corrupt. - Fix btree key helper functions. - Remove unneeded error returns for things that can't fail. - Fix buffer logging bugs in repair. - Clean up iterator return values. - Speed up directory entry creation. - Enable allocation of xattr value memory buffer during lookup. - Fix readahead racing with truncate/punch hole. - Other minor cleanups. - Fix one AGI/AGF deadlock with RENAME_WHITEOUT. - More BUG -> WARN whackamole. 
- Fix various problems with the log failing to advance under certain circumstances, which results in stalls during mount" * tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (45 commits) xfs: push the grant head when the log head moves forward xfs: push iclog state cleaning into xlog_state_clean_log xfs: factor iclog state processing out of xlog_state_do_callback() xfs: factor callbacks out of xlog_state_do_callback() xfs: factor debug code out of xlog_state_do_callback() xfs: prevent CIL push holdoff in log recovery xfs: fix missed wakeup on l_flush_wait xfs: push the AIL in xlog_grant_head_wake xfs: Use WARN_ON_ONCE for bailout mount-operation xfs: Fix deadlock between AGI and AGF with RENAME_WHITEOUT xfs: define a flags field for the AG geometry ioctl structure xfs: add a xfs_valid_startblock helper xfs: remove the unused XFS_ALLOC_USERDATA flag xfs: cleanup xfs_fsb_to_db xfs: fix the dax supported check in xfs_ioctl_setattr_dax_invalidate xfs: Fix stale data exposure when readahead races with hole punch fs: Export generic_fadvise() mm: Handle MADV_WILLNEED through vfs_fadvise() xfs: allocate xattr buffer on demand xfs: consolidate attribute value copying ...
-rw-r--r--fs/xfs/kmem.c79
-rw-r--r--fs/xfs/kmem.h15
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h7
-rw-r--r--fs/xfs/libxfs/xfs_attr.c79
-rw-r--r--fs/xfs/libxfs/xfs_attr.h6
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c130
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c85
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h11
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c16
-rw-r--r--fs/xfs/libxfs/xfs_btree.c14
-rw-r--r--fs/xfs/libxfs/xfs_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h4
-rw-r--r--fs/xfs/libxfs/xfs_defer.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c678
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c8
-rw-r--r--fs/xfs/libxfs/xfs_fs.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c9
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c8
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c16
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c50
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h12
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c59
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h11
-rw-r--r--fs/xfs/libxfs/xfs_shared.h6
-rw-r--r--fs/xfs/libxfs/xfs_types.h8
-rw-r--r--fs/xfs/scrub/agheader.c4
-rw-r--r--fs/xfs/scrub/attr.c6
-rw-r--r--fs/xfs/scrub/bmap.c81
-rw-r--r--fs/xfs/scrub/fscounters.c2
-rw-r--r--fs/xfs/scrub/repair.c6
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_acl.c14
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c8
-rw-r--r--fs/xfs/xfs_bmap_util.c22
-rw-r--r--fs/xfs/xfs_buf.c7
-rw-r--r--fs/xfs/xfs_buf.h6
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_dquot.c4
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_extent_busy.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c8
-rw-r--r--fs/xfs/xfs_file.c26
-rw-r--r--fs/xfs/xfs_fsmap.c12
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c85
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c25
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c6
-rw-r--r--fs/xfs/xfs_itable.c10
-rw-r--r--fs/xfs/xfs_itable.h13
-rw-r--r--fs/xfs/xfs_iwalk.c4
-rw-r--r--fs/xfs/xfs_iwalk.h13
-rw-r--r--fs/xfs/xfs_log.c466
-rw-r--r--fs/xfs/xfs_log_cil.c10
-rw-r--r--fs/xfs/xfs_log_recover.c50
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h7
-rw-r--r--fs/xfs/xfs_mru_cache.c4
-rw-r--r--fs/xfs/xfs_qm.c4
-rw-r--r--fs/xfs/xfs_refcount_item.c16
-rw-r--r--fs/xfs/xfs_reflink.c23
-rw-r--r--fs/xfs/xfs_rmap_item.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c4
-rw-r--r--fs/xfs/xfs_super.c3
-rw-r--r--fs/xfs/xfs_trace.h34
-rw-r--r--fs/xfs/xfs_trans.c4
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--include/linux/fs.h2
-rw-r--r--mm/fadvise.c4
-rw-r--r--mm/madvise.c22
81 files changed, 1315 insertions, 1089 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 16bb9a328678..da031b93e182 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,10 +3,10 @@
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved. 4 * All Rights Reserved.
5 */ 5 */
6#include <linux/sched/mm.h> 6#include "xfs.h"
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
8#include "kmem.h"
9#include "xfs_message.h" 8#include "xfs_message.h"
9#include "xfs_trace.h"
10 10
11void * 11void *
12kmem_alloc(size_t size, xfs_km_flags_t flags) 12kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
15 gfp_t lflags = kmem_flags_convert(flags); 15 gfp_t lflags = kmem_flags_convert(flags);
16 void *ptr; 16 void *ptr;
17 17
18 trace_kmem_alloc(size, flags, _RET_IP_);
19
18 do { 20 do {
19 ptr = kmalloc(size, lflags); 21 ptr = kmalloc(size, lflags);
20 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 22 if (ptr || (flags & KM_MAYFAIL))
21 return ptr; 23 return ptr;
22 if (!(++retries % 100)) 24 if (!(++retries % 100))
23 xfs_err(NULL, 25 xfs_err(NULL,
@@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
28 } while (1); 30 } while (1);
29} 31}
30 32
31void * 33
32kmem_alloc_large(size_t size, xfs_km_flags_t flags) 34/*
35 * __vmalloc() will allocate data pages and auxillary structures (e.g.
36 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
37 * we need to tell memory reclaim that we are in such a context via
38 * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
39 * and potentially deadlocking.
40 */
41static void *
42__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
33{ 43{
34 unsigned nofs_flag = 0; 44 unsigned nofs_flag = 0;
35 void *ptr; 45 void *ptr;
36 gfp_t lflags; 46 gfp_t lflags = kmem_flags_convert(flags);
37
38 ptr = kmem_alloc(size, flags | KM_MAYFAIL);
39 if (ptr)
40 return ptr;
41 47
42 /*
43 * __vmalloc() will allocate data pages and auxillary structures (e.g.
44 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
45 * here. Hence we need to tell memory reclaim that we are in such a
46 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
47 * the filesystem here and potentially deadlocking.
48 */
49 if (flags & KM_NOFS) 48 if (flags & KM_NOFS)
50 nofs_flag = memalloc_nofs_save(); 49 nofs_flag = memalloc_nofs_save();
51 50
52 lflags = kmem_flags_convert(flags);
53 ptr = __vmalloc(size, lflags, PAGE_KERNEL); 51 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
54 52
55 if (flags & KM_NOFS) 53 if (flags & KM_NOFS)
@@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags)
58 return ptr; 56 return ptr;
59} 57}
60 58
59/*
60 * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
61 * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
62 * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
63 * aligned region.
64 */
65void *
66kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
67{
68 void *ptr;
69
70 trace_kmem_alloc_io(size, flags, _RET_IP_);
71
72 if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
73 align_mask = PAGE_SIZE - 1;
74
75 ptr = kmem_alloc(size, flags | KM_MAYFAIL);
76 if (ptr) {
77 if (!((uintptr_t)ptr & align_mask))
78 return ptr;
79 kfree(ptr);
80 }
81 return __kmem_vmalloc(size, flags);
82}
83
84void *
85kmem_alloc_large(size_t size, xfs_km_flags_t flags)
86{
87 void *ptr;
88
89 trace_kmem_alloc_large(size, flags, _RET_IP_);
90
91 ptr = kmem_alloc(size, flags | KM_MAYFAIL);
92 if (ptr)
93 return ptr;
94 return __kmem_vmalloc(size, flags);
95}
96
61void * 97void *
62kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) 98kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
63{ 99{
@@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
65 gfp_t lflags = kmem_flags_convert(flags); 101 gfp_t lflags = kmem_flags_convert(flags);
66 void *ptr; 102 void *ptr;
67 103
104 trace_kmem_realloc(newsize, flags, _RET_IP_);
105
68 do { 106 do {
69 ptr = krealloc(old, newsize, lflags); 107 ptr = krealloc(old, newsize, lflags);
70 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 108 if (ptr || (flags & KM_MAYFAIL))
71 return ptr; 109 return ptr;
72 if (!(++retries % 100)) 110 if (!(++retries % 100))
73 xfs_err(NULL, 111 xfs_err(NULL,
@@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
85 gfp_t lflags = kmem_flags_convert(flags); 123 gfp_t lflags = kmem_flags_convert(flags);
86 void *ptr; 124 void *ptr;
87 125
126 trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
88 do { 127 do {
89 ptr = kmem_cache_alloc(zone, lflags); 128 ptr = kmem_cache_alloc(zone, lflags);
90 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 129 if (ptr || (flags & KM_MAYFAIL))
91 return ptr; 130 return ptr;
92 if (!(++retries % 100)) 131 if (!(++retries % 100))
93 xfs_err(NULL, 132 xfs_err(NULL,
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 267655acd426..8170d95cf930 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -16,8 +16,6 @@
16 */ 16 */
17 17
18typedef unsigned __bitwise xfs_km_flags_t; 18typedef unsigned __bitwise xfs_km_flags_t;
19#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
20#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
21#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) 19#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
22#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) 20#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
23#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) 21#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags)
32{ 30{
33 gfp_t lflags; 31 gfp_t lflags;
34 32
35 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); 33 BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
36 34
37 if (flags & KM_NOSLEEP) { 35 lflags = GFP_KERNEL | __GFP_NOWARN;
38 lflags = GFP_ATOMIC | __GFP_NOWARN; 36 if (flags & KM_NOFS)
39 } else { 37 lflags &= ~__GFP_FS;
40 lflags = GFP_KERNEL | __GFP_NOWARN;
41 if (flags & KM_NOFS)
42 lflags &= ~__GFP_FS;
43 }
44 38
45 /* 39 /*
46 * Default page/slab allocator behavior is to retry for ever 40 * Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
59} 53}
60 54
61extern void *kmem_alloc(size_t, xfs_km_flags_t); 55extern void *kmem_alloc(size_t, xfs_km_flags_t);
56extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
62extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); 57extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
63extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); 58extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
64static inline void kmem_free(const void *ptr) 59static inline void kmem_free(const void *ptr)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 372ad55631fc..533b04aaf6f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2205,7 +2205,7 @@ xfs_defer_agfl_block(
2205 ASSERT(xfs_bmap_free_item_zone != NULL); 2205 ASSERT(xfs_bmap_free_item_zone != NULL);
2206 ASSERT(oinfo != NULL); 2206 ASSERT(oinfo != NULL);
2207 2207
2208 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 2208 new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
2209 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2209 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2210 new->xefi_blockcount = 1; 2210 new->xefi_blockcount = 1;
2211 new->xefi_oinfo = *oinfo; 2211 new->xefi_oinfo = *oinfo;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2c07c2..58fa85cec325 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -81,10 +81,9 @@ typedef struct xfs_alloc_arg {
81/* 81/*
82 * Defines for datatype 82 * Defines for datatype
83 */ 83 */
84#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ 84#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */
85#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ 85#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */
86#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ 86#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
87#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
88 87
89static inline bool 88static inline bool
90xfs_alloc_is_userdata(int datatype) 89xfs_alloc_is_userdata(int datatype)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index d48fcf11cc35..510ca6974604 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -97,7 +97,10 @@ xfs_inode_hasattr(
97 * Overall external interface routines. 97 * Overall external interface routines.
98 *========================================================================*/ 98 *========================================================================*/
99 99
100/* Retrieve an extended attribute and its value. Must have ilock. */ 100/*
101 * Retrieve an extended attribute and its value. Must have ilock.
102 * Returns 0 on successful retrieval, otherwise an error.
103 */
101int 104int
102xfs_attr_get_ilocked( 105xfs_attr_get_ilocked(
103 struct xfs_inode *ip, 106 struct xfs_inode *ip,
@@ -115,12 +118,28 @@ xfs_attr_get_ilocked(
115 return xfs_attr_node_get(args); 118 return xfs_attr_node_get(args);
116} 119}
117 120
118/* Retrieve an extended attribute by name, and its value. */ 121/*
122 * Retrieve an extended attribute by name, and its value if requested.
123 *
124 * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
125 * just an indication whether the attribute exists and the size of the value if
126 * it exists. The size is returned in @valuelenp,
127 *
128 * If the attribute is found, but exceeds the size limit set by the caller in
129 * @valuelenp, return -ERANGE with the size of the attribute that was found in
130 * @valuelenp.
131 *
132 * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
133 * existence of the attribute has been determined. On success, return that
134 * buffer to the caller and leave them to free it. On failure, free any
135 * allocated buffer and ensure the buffer pointer returned to the caller is
136 * null.
137 */
119int 138int
120xfs_attr_get( 139xfs_attr_get(
121 struct xfs_inode *ip, 140 struct xfs_inode *ip,
122 const unsigned char *name, 141 const unsigned char *name,
123 unsigned char *value, 142 unsigned char **value,
124 int *valuelenp, 143 int *valuelenp,
125 int flags) 144 int flags)
126{ 145{
@@ -128,6 +147,8 @@ xfs_attr_get(
128 uint lock_mode; 147 uint lock_mode;
129 int error; 148 int error;
130 149
150 ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
151
131 XFS_STATS_INC(ip->i_mount, xs_attr_get); 152 XFS_STATS_INC(ip->i_mount, xs_attr_get);
132 153
133 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 154 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -137,17 +158,29 @@ xfs_attr_get(
137 if (error) 158 if (error)
138 return error; 159 return error;
139 160
140 args.value = value;
141 args.valuelen = *valuelenp;
142 /* Entirely possible to look up a name which doesn't exist */ 161 /* Entirely possible to look up a name which doesn't exist */
143 args.op_flags = XFS_DA_OP_OKNOENT; 162 args.op_flags = XFS_DA_OP_OKNOENT;
163 if (flags & ATTR_ALLOC)
164 args.op_flags |= XFS_DA_OP_ALLOCVAL;
165 else
166 args.value = *value;
167 args.valuelen = *valuelenp;
144 168
145 lock_mode = xfs_ilock_attr_map_shared(ip); 169 lock_mode = xfs_ilock_attr_map_shared(ip);
146 error = xfs_attr_get_ilocked(ip, &args); 170 error = xfs_attr_get_ilocked(ip, &args);
147 xfs_iunlock(ip, lock_mode); 171 xfs_iunlock(ip, lock_mode);
148
149 *valuelenp = args.valuelen; 172 *valuelenp = args.valuelen;
150 return error == -EEXIST ? 0 : error; 173
174 /* on error, we have to clean up allocated value buffers */
175 if (error) {
176 if (flags & ATTR_ALLOC) {
177 kmem_free(args.value);
178 *value = NULL;
179 }
180 return error;
181 }
182 *value = args.value;
183 return 0;
151} 184}
152 185
153/* 186/*
@@ -768,6 +801,8 @@ xfs_attr_leaf_removename(
768 * 801 *
769 * This leaf block cannot have a "remote" value, we only call this routine 802 * This leaf block cannot have a "remote" value, we only call this routine
770 * if bmap_one_block() says there is only one block (ie: no remote blks). 803 * if bmap_one_block() says there is only one block (ie: no remote blks).
804 *
805 * Returns 0 on successful retrieval, otherwise an error.
771 */ 806 */
772STATIC int 807STATIC int
773xfs_attr_leaf_get(xfs_da_args_t *args) 808xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -789,9 +824,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
789 } 824 }
790 error = xfs_attr3_leaf_getvalue(bp, args); 825 error = xfs_attr3_leaf_getvalue(bp, args);
791 xfs_trans_brelse(args->trans, bp); 826 xfs_trans_brelse(args->trans, bp);
792 if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
793 error = xfs_attr_rmtval_get(args);
794 }
795 return error; 827 return error;
796} 828}
797 829
@@ -1268,11 +1300,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1268} 1300}
1269 1301
1270/* 1302/*
1271 * Look up a filename in a node attribute list. 1303 * Retrieve the attribute data from a node attribute list.
1272 * 1304 *
1273 * This routine gets called for any attribute fork that has more than one 1305 * This routine gets called for any attribute fork that has more than one
1274 * block, ie: both true Btree attr lists and for single-leaf-blocks with 1306 * block, ie: both true Btree attr lists and for single-leaf-blocks with
1275 * "remote" values taking up more blocks. 1307 * "remote" values taking up more blocks.
1308 *
1309 * Returns 0 on successful retrieval, otherwise an error.
1276 */ 1310 */
1277STATIC int 1311STATIC int
1278xfs_attr_node_get(xfs_da_args_t *args) 1312xfs_attr_node_get(xfs_da_args_t *args)
@@ -1294,24 +1328,21 @@ xfs_attr_node_get(xfs_da_args_t *args)
1294 error = xfs_da3_node_lookup_int(state, &retval); 1328 error = xfs_da3_node_lookup_int(state, &retval);
1295 if (error) { 1329 if (error) {
1296 retval = error; 1330 retval = error;
1297 } else if (retval == -EEXIST) { 1331 goto out_release;
1298 blk = &state->path.blk[ state->path.active-1 ];
1299 ASSERT(blk->bp != NULL);
1300 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1301
1302 /*
1303 * Get the value, local or "remote"
1304 */
1305 retval = xfs_attr3_leaf_getvalue(blk->bp, args);
1306 if (!retval && (args->rmtblkno > 0)
1307 && !(args->flags & ATTR_KERNOVAL)) {
1308 retval = xfs_attr_rmtval_get(args);
1309 }
1310 } 1332 }
1333 if (retval != -EEXIST)
1334 goto out_release;
1335
1336 /*
1337 * Get the value, local or "remote"
1338 */
1339 blk = &state->path.blk[state->path.active - 1];
1340 retval = xfs_attr3_leaf_getvalue(blk->bp, args);
1311 1341
1312 /* 1342 /*
1313 * If not in a transaction, we have to release all the buffers. 1343 * If not in a transaction, we have to release all the buffers.
1314 */ 1344 */
1345out_release:
1315 for (i = 0; i < state->path.active; i++) { 1346 for (i = 0; i < state->path.active; i++) {
1316 xfs_trans_brelse(args->trans, state->path.blk[i].bp); 1347 xfs_trans_brelse(args->trans, state->path.blk[i].bp);
1317 state->path.blk[i].bp = NULL; 1348 state->path.blk[i].bp = NULL;
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index ff28ebf3b635..94badfa1743e 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -37,6 +37,7 @@ struct xfs_attr_list_context;
37#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ 37#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
38 38
39#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ 39#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
40#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */
40 41
41#define XFS_ATTR_FLAGS \ 42#define XFS_ATTR_FLAGS \
42 { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ 43 { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
@@ -47,7 +48,8 @@ struct xfs_attr_list_context;
47 { ATTR_REPLACE, "REPLACE" }, \ 48 { ATTR_REPLACE, "REPLACE" }, \
48 { ATTR_KERNOTIME, "KERNOTIME" }, \ 49 { ATTR_KERNOTIME, "KERNOTIME" }, \
49 { ATTR_KERNOVAL, "KERNOVAL" }, \ 50 { ATTR_KERNOVAL, "KERNOVAL" }, \
50 { ATTR_INCOMPLETE, "INCOMPLETE" } 51 { ATTR_INCOMPLETE, "INCOMPLETE" }, \
52 { ATTR_ALLOC, "ALLOC" }
51 53
52/* 54/*
53 * The maximum size (into the kernel or returned from the kernel) of an 55 * The maximum size (into the kernel or returned from the kernel) of an
@@ -143,7 +145,7 @@ int xfs_attr_list_int(struct xfs_attr_list_context *);
143int xfs_inode_hasattr(struct xfs_inode *ip); 145int xfs_inode_hasattr(struct xfs_inode *ip);
144int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); 146int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
145int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, 147int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
146 unsigned char *value, int *valuelenp, int flags); 148 unsigned char **value, int *valuelenp, int flags);
147int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, 149int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
148 unsigned char *value, int valuelen, int flags); 150 unsigned char *value, int valuelen, int flags);
149int xfs_attr_set_args(struct xfs_da_args *args); 151int xfs_attr_set_args(struct xfs_da_args *args);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 70eb941d02e4..b9f019603d0b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -393,6 +393,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
393 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); 393 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
394} 394}
395 395
396static int
397xfs_attr_copy_value(
398 struct xfs_da_args *args,
399 unsigned char *value,
400 int valuelen)
401{
402 /*
403 * No copy if all we have to do is get the length
404 */
405 if (args->flags & ATTR_KERNOVAL) {
406 args->valuelen = valuelen;
407 return 0;
408 }
409
410 /*
411 * No copy if the length of the existing buffer is too small
412 */
413 if (args->valuelen < valuelen) {
414 args->valuelen = valuelen;
415 return -ERANGE;
416 }
417
418 if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
419 args->value = kmem_alloc_large(valuelen, 0);
420 if (!args->value)
421 return -ENOMEM;
422 }
423 args->valuelen = valuelen;
424
425 /* remote block xattr requires IO for copy-in */
426 if (args->rmtblkno)
427 return xfs_attr_rmtval_get(args);
428
429 /*
430 * This is to prevent a GCC warning because the remote xattr case
431 * doesn't have a value to pass in. In that case, we never reach here,
432 * but GCC can't work that out and so throws a "passing NULL to
433 * memcpy" warning.
434 */
435 if (!value)
436 return -EINVAL;
437 memcpy(args->value, value, valuelen);
438 return 0;
439}
396 440
397/*======================================================================== 441/*========================================================================
398 * External routines when attribute fork size < XFS_LITINO(mp). 442 * External routines when attribute fork size < XFS_LITINO(mp).
@@ -720,15 +764,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
720} 764}
721 765
722/* 766/*
723 * Look up a name in a shortform attribute list structure. 767 * Retreive the attribute value and length.
768 *
769 * If ATTR_KERNOVAL is specified, only the length needs to be returned.
770 * Unlike a lookup, we only return an error if the attribute does not
771 * exist or we can't retrieve the value.
724 */ 772 */
725/*ARGSUSED*/
726int 773int
727xfs_attr_shortform_getvalue(xfs_da_args_t *args) 774xfs_attr_shortform_getvalue(
775 struct xfs_da_args *args)
728{ 776{
729 xfs_attr_shortform_t *sf; 777 struct xfs_attr_shortform *sf;
730 xfs_attr_sf_entry_t *sfe; 778 struct xfs_attr_sf_entry *sfe;
731 int i; 779 int i;
732 780
733 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); 781 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
734 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; 782 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -741,18 +789,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
741 continue; 789 continue;
742 if (!xfs_attr_namesp_match(args->flags, sfe->flags)) 790 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
743 continue; 791 continue;
744 if (args->flags & ATTR_KERNOVAL) { 792 return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
745 args->valuelen = sfe->valuelen; 793 sfe->valuelen);
746 return -EEXIST;
747 }
748 if (args->valuelen < sfe->valuelen) {
749 args->valuelen = sfe->valuelen;
750 return -ERANGE;
751 }
752 args->valuelen = sfe->valuelen;
753 memcpy(args->value, &sfe->nameval[args->namelen],
754 args->valuelen);
755 return -EEXIST;
756 } 794 }
757 return -ENOATTR; 795 return -ENOATTR;
758} 796}
@@ -782,7 +820,7 @@ xfs_attr_shortform_to_leaf(
782 ifp = dp->i_afp; 820 ifp = dp->i_afp;
783 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 821 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
784 size = be16_to_cpu(sf->hdr.totsize); 822 size = be16_to_cpu(sf->hdr.totsize);
785 tmpbuffer = kmem_alloc(size, KM_SLEEP); 823 tmpbuffer = kmem_alloc(size, 0);
786 ASSERT(tmpbuffer != NULL); 824 ASSERT(tmpbuffer != NULL);
787 memcpy(tmpbuffer, ifp->if_u1.if_data, size); 825 memcpy(tmpbuffer, ifp->if_u1.if_data, size);
788 sf = (xfs_attr_shortform_t *)tmpbuffer; 826 sf = (xfs_attr_shortform_t *)tmpbuffer;
@@ -985,7 +1023,7 @@ xfs_attr3_leaf_to_shortform(
985 1023
986 trace_xfs_attr_leaf_to_sf(args); 1024 trace_xfs_attr_leaf_to_sf(args);
987 1025
988 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 1026 tmpbuffer = kmem_alloc(args->geo->blksize, 0);
989 if (!tmpbuffer) 1027 if (!tmpbuffer)
990 return -ENOMEM; 1028 return -ENOMEM;
991 1029
@@ -1448,7 +1486,7 @@ xfs_attr3_leaf_compact(
1448 1486
1449 trace_xfs_attr_leaf_compact(args); 1487 trace_xfs_attr_leaf_compact(args);
1450 1488
1451 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 1489 tmpbuffer = kmem_alloc(args->geo->blksize, 0);
1452 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 1490 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
1453 memset(bp->b_addr, 0, args->geo->blksize); 1491 memset(bp->b_addr, 0, args->geo->blksize);
1454 leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; 1492 leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -2167,7 +2205,7 @@ xfs_attr3_leaf_unbalance(
2167 struct xfs_attr_leafblock *tmp_leaf; 2205 struct xfs_attr_leafblock *tmp_leaf;
2168 struct xfs_attr3_icleaf_hdr tmphdr; 2206 struct xfs_attr3_icleaf_hdr tmphdr;
2169 2207
2170 tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); 2208 tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
2171 2209
2172 /* 2210 /*
2173 * Copy the header into the temp leaf so that all the stuff 2211 * Copy the header into the temp leaf so that all the stuff
@@ -2350,6 +2388,10 @@ xfs_attr3_leaf_lookup_int(
2350/* 2388/*
2351 * Get the value associated with an attribute name from a leaf attribute 2389 * Get the value associated with an attribute name from a leaf attribute
2352 * list structure. 2390 * list structure.
2391 *
2392 * If ATTR_KERNOVAL is specified, only the length needs to be returned.
2393 * Unlike a lookup, we only return an error if the attribute does not
2394 * exist or we can't retrieve the value.
2353 */ 2395 */
2354int 2396int
2355xfs_attr3_leaf_getvalue( 2397xfs_attr3_leaf_getvalue(
@@ -2361,7 +2403,6 @@ xfs_attr3_leaf_getvalue(
2361 struct xfs_attr_leaf_entry *entry; 2403 struct xfs_attr_leaf_entry *entry;
2362 struct xfs_attr_leaf_name_local *name_loc; 2404 struct xfs_attr_leaf_name_local *name_loc;
2363 struct xfs_attr_leaf_name_remote *name_rmt; 2405 struct xfs_attr_leaf_name_remote *name_rmt;
2364 int valuelen;
2365 2406
2366 leaf = bp->b_addr; 2407 leaf = bp->b_addr;
2367 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); 2408 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2373,36 +2414,19 @@ xfs_attr3_leaf_getvalue(
2373 name_loc = xfs_attr3_leaf_name_local(leaf, args->index); 2414 name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
2374 ASSERT(name_loc->namelen == args->namelen); 2415 ASSERT(name_loc->namelen == args->namelen);
2375 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); 2416 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
2376 valuelen = be16_to_cpu(name_loc->valuelen); 2417 return xfs_attr_copy_value(args,
2377 if (args->flags & ATTR_KERNOVAL) { 2418 &name_loc->nameval[args->namelen],
2378 args->valuelen = valuelen; 2419 be16_to_cpu(name_loc->valuelen));
2379 return 0; 2420 }
2380 } 2421
2381 if (args->valuelen < valuelen) { 2422 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
2382 args->valuelen = valuelen; 2423 ASSERT(name_rmt->namelen == args->namelen);
2383 return -ERANGE; 2424 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
2384 } 2425 args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
2385 args->valuelen = valuelen; 2426 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2386 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); 2427 args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
2387 } else { 2428 args->rmtvaluelen);
2388 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); 2429 return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
2389 ASSERT(name_rmt->namelen == args->namelen);
2390 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
2391 args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
2392 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2393 args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
2394 args->rmtvaluelen);
2395 if (args->flags & ATTR_KERNOVAL) {
2396 args->valuelen = args->rmtvaluelen;
2397 return 0;
2398 }
2399 if (args->valuelen < args->rmtvaluelen) {
2400 args->valuelen = args->rmtvaluelen;
2401 return -ERANGE;
2402 }
2403 args->valuelen = args->rmtvaluelen;
2404 }
2405 return 0;
2406} 2430}
2407 2431
2408/*======================================================================== 2432/*========================================================================
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4eb30d357045..3e39b7d40f25 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -358,6 +358,8 @@ xfs_attr_rmtval_copyin(
358/* 358/*
359 * Read the value associated with an attribute from the out-of-line buffer 359 * Read the value associated with an attribute from the out-of-line buffer
360 * that we stored it in. 360 * that we stored it in.
361 *
362 * Returns 0 on successful retrieval, otherwise an error.
361 */ 363 */
362int 364int
363xfs_attr_rmtval_get( 365xfs_attr_rmtval_get(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 07aad70f3931..054b4ce30033 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -553,7 +553,7 @@ __xfs_bmap_add_free(
553#endif 553#endif
554 ASSERT(xfs_bmap_free_item_zone != NULL); 554 ASSERT(xfs_bmap_free_item_zone != NULL);
555 555
556 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 556 new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
557 new->xefi_startblock = bno; 557 new->xefi_startblock = bno;
558 new->xefi_blockcount = (xfs_extlen_t)len; 558 new->xefi_blockcount = (xfs_extlen_t)len;
559 if (oinfo) 559 if (oinfo)
@@ -1099,7 +1099,7 @@ xfs_bmap_add_attrfork(
1099 if (error) 1099 if (error)
1100 goto trans_cancel; 1100 goto trans_cancel;
1101 ASSERT(ip->i_afp == NULL); 1101 ASSERT(ip->i_afp == NULL);
1102 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 1102 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
1103 ip->i_afp->if_flags = XFS_IFEXTENTS; 1103 ip->i_afp->if_flags = XFS_IFEXTENTS;
1104 logflags = 0; 1104 logflags = 0;
1105 switch (ip->i_d.di_format) { 1105 switch (ip->i_d.di_format) {
@@ -1985,11 +1985,8 @@ xfs_bmap_add_extent_delay_real(
1985 } 1985 }
1986 1986
1987 /* add reverse mapping unless caller opted out */ 1987 /* add reverse mapping unless caller opted out */
1988 if (!(bma->flags & XFS_BMAPI_NORMAP)) { 1988 if (!(bma->flags & XFS_BMAPI_NORMAP))
1989 error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); 1989 xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
1990 if (error)
1991 goto done;
1992 }
1993 1990
1994 /* convert to a btree if necessary */ 1991 /* convert to a btree if necessary */
1995 if (xfs_bmap_needs_btree(bma->ip, whichfork)) { 1992 if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2471,9 +2468,7 @@ xfs_bmap_add_extent_unwritten_real(
2471 } 2468 }
2472 2469
2473 /* update reverse mappings */ 2470 /* update reverse mappings */
2474 error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); 2471 xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
2475 if (error)
2476 goto done;
2477 2472
2478 /* convert to a btree if necessary */ 2473 /* convert to a btree if necessary */
2479 if (xfs_bmap_needs_btree(ip, whichfork)) { 2474 if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2832,11 +2827,8 @@ xfs_bmap_add_extent_hole_real(
2832 } 2827 }
2833 2828
2834 /* add reverse mapping unless caller opted out */ 2829 /* add reverse mapping unless caller opted out */
2835 if (!(flags & XFS_BMAPI_NORMAP)) { 2830 if (!(flags & XFS_BMAPI_NORMAP))
2836 error = xfs_rmap_map_extent(tp, ip, whichfork, new); 2831 xfs_rmap_map_extent(tp, ip, whichfork, new);
2837 if (error)
2838 goto done;
2839 }
2840 2832
2841 /* convert to a btree if necessary */ 2833 /* convert to a btree if necessary */
2842 if (xfs_bmap_needs_btree(ip, whichfork)) { 2834 if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -4050,12 +4042,8 @@ xfs_bmapi_allocate(
4050 */ 4042 */
4051 if (!(bma->flags & XFS_BMAPI_METADATA)) { 4043 if (!(bma->flags & XFS_BMAPI_METADATA)) {
4052 bma->datatype = XFS_ALLOC_NOBUSY; 4044 bma->datatype = XFS_ALLOC_NOBUSY;
4053 if (whichfork == XFS_DATA_FORK) { 4045 if (whichfork == XFS_DATA_FORK && bma->offset == 0)
4054 if (bma->offset == 0) 4046 bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
4055 bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
4056 else
4057 bma->datatype |= XFS_ALLOC_USERDATA;
4058 }
4059 if (bma->flags & XFS_BMAPI_ZERO) 4047 if (bma->flags & XFS_BMAPI_ZERO)
4060 bma->datatype |= XFS_ALLOC_USERDATA_ZERO; 4048 bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
4061 } 4049 }
@@ -4401,12 +4389,9 @@ xfs_bmapi_write(
4401 * If this is a CoW allocation, record the data in 4389 * If this is a CoW allocation, record the data in
4402 * the refcount btree for orphan recovery. 4390 * the refcount btree for orphan recovery.
4403 */ 4391 */
4404 if (whichfork == XFS_COW_FORK) { 4392 if (whichfork == XFS_COW_FORK)
4405 error = xfs_refcount_alloc_cow_extent(tp, 4393 xfs_refcount_alloc_cow_extent(tp, bma.blkno,
4406 bma.blkno, bma.length); 4394 bma.length);
4407 if (error)
4408 goto error0;
4409 }
4410 } 4395 }
4411 4396
4412 /* Deal with the allocated space we found. */ 4397 /* Deal with the allocated space we found. */
@@ -4530,7 +4515,7 @@ xfs_bmapi_convert_delalloc(
4530 if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) 4515 if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
4531 goto out_finish; 4516 goto out_finish;
4532 error = -EFSCORRUPTED; 4517 error = -EFSCORRUPTED;
4533 if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) 4518 if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
4534 goto out_finish; 4519 goto out_finish;
4535 4520
4536 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); 4521 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
@@ -4540,12 +4525,8 @@ xfs_bmapi_convert_delalloc(
4540 *imap = bma.got; 4525 *imap = bma.got;
4541 *seq = READ_ONCE(ifp->if_seq); 4526 *seq = READ_ONCE(ifp->if_seq);
4542 4527
4543 if (whichfork == XFS_COW_FORK) { 4528 if (whichfork == XFS_COW_FORK)
4544 error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, 4529 xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
4545 bma.length);
4546 if (error)
4547 goto out_finish;
4548 }
4549 4530
4550 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, 4531 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
4551 whichfork); 4532 whichfork);
@@ -5149,18 +5130,14 @@ xfs_bmap_del_extent_real(
5149 } 5130 }
5150 5131
5151 /* remove reverse mapping */ 5132 /* remove reverse mapping */
5152 error = xfs_rmap_unmap_extent(tp, ip, whichfork, del); 5133 xfs_rmap_unmap_extent(tp, ip, whichfork, del);
5153 if (error)
5154 goto done;
5155 5134
5156 /* 5135 /*
5157 * If we need to, add to list of extents to delete. 5136 * If we need to, add to list of extents to delete.
5158 */ 5137 */
5159 if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { 5138 if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
5160 if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { 5139 if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
5161 error = xfs_refcount_decrease_extent(tp, del); 5140 xfs_refcount_decrease_extent(tp, del);
5162 if (error)
5163 goto done;
5164 } else { 5141 } else {
5165 __xfs_bmap_add_free(tp, del->br_startblock, 5142 __xfs_bmap_add_free(tp, del->br_startblock,
5166 del->br_blockcount, NULL, 5143 del->br_blockcount, NULL,
@@ -5651,12 +5628,11 @@ done:
5651 &new); 5628 &new);
5652 5629
5653 /* update reverse mapping. rmap functions merge the rmaps for us */ 5630 /* update reverse mapping. rmap functions merge the rmaps for us */
5654 error = xfs_rmap_unmap_extent(tp, ip, whichfork, got); 5631 xfs_rmap_unmap_extent(tp, ip, whichfork, got);
5655 if (error)
5656 return error;
5657 memcpy(&new, got, sizeof(new)); 5632 memcpy(&new, got, sizeof(new));
5658 new.br_startoff = left->br_startoff + left->br_blockcount; 5633 new.br_startoff = left->br_startoff + left->br_blockcount;
5659 return xfs_rmap_map_extent(tp, ip, whichfork, &new); 5634 xfs_rmap_map_extent(tp, ip, whichfork, &new);
5635 return 0;
5660} 5636}
5661 5637
5662static int 5638static int
@@ -5695,10 +5671,9 @@ xfs_bmap_shift_update_extent(
5695 got); 5671 got);
5696 5672
5697 /* update reverse mapping */ 5673 /* update reverse mapping */
5698 error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); 5674 xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
5699 if (error) 5675 xfs_rmap_map_extent(tp, ip, whichfork, got);
5700 return error; 5676 return 0;
5701 return xfs_rmap_map_extent(tp, ip, whichfork, got);
5702} 5677}
5703 5678
5704int 5679int
@@ -6094,7 +6069,7 @@ __xfs_bmap_add(
6094 bmap->br_blockcount, 6069 bmap->br_blockcount,
6095 bmap->br_state); 6070 bmap->br_state);
6096 6071
6097 bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); 6072 bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
6098 INIT_LIST_HEAD(&bi->bi_list); 6073 INIT_LIST_HEAD(&bi->bi_list);
6099 bi->bi_type = type; 6074 bi->bi_type = type;
6100 bi->bi_owner = ip; 6075 bi->bi_owner = ip;
@@ -6106,29 +6081,29 @@ __xfs_bmap_add(
6106} 6081}
6107 6082
6108/* Map an extent into a file. */ 6083/* Map an extent into a file. */
6109int 6084void
6110xfs_bmap_map_extent( 6085xfs_bmap_map_extent(
6111 struct xfs_trans *tp, 6086 struct xfs_trans *tp,
6112 struct xfs_inode *ip, 6087 struct xfs_inode *ip,
6113 struct xfs_bmbt_irec *PREV) 6088 struct xfs_bmbt_irec *PREV)
6114{ 6089{
6115 if (!xfs_bmap_is_update_needed(PREV)) 6090 if (!xfs_bmap_is_update_needed(PREV))
6116 return 0; 6091 return;
6117 6092
6118 return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); 6093 __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
6119} 6094}
6120 6095
6121/* Unmap an extent out of a file. */ 6096/* Unmap an extent out of a file. */
6122int 6097void
6123xfs_bmap_unmap_extent( 6098xfs_bmap_unmap_extent(
6124 struct xfs_trans *tp, 6099 struct xfs_trans *tp,
6125 struct xfs_inode *ip, 6100 struct xfs_inode *ip,
6126 struct xfs_bmbt_irec *PREV) 6101 struct xfs_bmbt_irec *PREV)
6127{ 6102{
6128 if (!xfs_bmap_is_update_needed(PREV)) 6103 if (!xfs_bmap_is_update_needed(PREV))
6129 return 0; 6104 return;
6130 6105
6131 return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); 6106 __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
6132} 6107}
6133 6108
6134/* 6109/*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8f597f9abdbe..5bb446d80542 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -171,6 +171,13 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
171 !isnullstartblock(irec->br_startblock); 171 !isnullstartblock(irec->br_startblock);
172} 172}
173 173
174/*
175 * Check the mapping for obviously garbage allocations that could trash the
176 * filesystem immediately.
177 */
178#define xfs_valid_startblock(ip, startblock) \
179 ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
180
174void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, 181void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
175 xfs_filblks_t len); 182 xfs_filblks_t len);
176int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); 183int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
@@ -254,9 +261,9 @@ int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
254 enum xfs_bmap_intent_type type, int whichfork, 261 enum xfs_bmap_intent_type type, int whichfork,
255 xfs_fileoff_t startoff, xfs_fsblock_t startblock, 262 xfs_fileoff_t startoff, xfs_fsblock_t startblock,
256 xfs_filblks_t *blockcount, xfs_exntst_t state); 263 xfs_filblks_t *blockcount, xfs_exntst_t state);
257int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 264void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
258 struct xfs_bmbt_irec *imap); 265 struct xfs_bmbt_irec *imap);
259int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 266void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
260 struct xfs_bmbt_irec *imap); 267 struct xfs_bmbt_irec *imap);
261 268
262static inline int xfs_bmap_fork_to_state(int whichfork) 269static inline int xfs_bmap_fork_to_state(int whichfork)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fbb18ba5d905..ffe608d2a2d9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys(
400 union xfs_btree_key *k1, 400 union xfs_btree_key *k1,
401 union xfs_btree_key *k2) 401 union xfs_btree_key *k2)
402{ 402{
403 return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - 403 uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
404 be64_to_cpu(k2->bmbt.br_startoff); 404 uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
405
406 /*
407 * Note: This routine previously casted a and b to int64 and subtracted
408 * them to generate a result. This lead to problems if b was the
409 * "maximum" key value (all ones) being signed incorrectly, hence this
410 * somewhat less efficient version.
411 */
412 if (a > b)
413 return 1;
414 if (b > a)
415 return -1;
416 return 0;
405} 417}
406 418
407static xfs_failaddr_t 419static xfs_failaddr_t
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f1048efa4268..71de937f9e64 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4466,8 +4466,6 @@ xfs_btree_lblock_verify(
4466 * btree block 4466 * btree block
4467 * 4467 *
4468 * @bp: buffer containing the btree block 4468 * @bp: buffer containing the btree block
4469 * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4470 * @pag_max_level: pointer to the per-ag max level field
4471 */ 4469 */
4472xfs_failaddr_t 4470xfs_failaddr_t
4473xfs_btree_sblock_v5hdr_verify( 4471xfs_btree_sblock_v5hdr_verify(
@@ -4600,7 +4598,7 @@ xfs_btree_simple_query_range(
4600 4598
4601 /* Callback */ 4599 /* Callback */
4602 error = fn(cur, recp, priv); 4600 error = fn(cur, recp, priv);
4603 if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) 4601 if (error)
4604 break; 4602 break;
4605 4603
4606advloop: 4604advloop:
@@ -4702,8 +4700,7 @@ pop_up:
4702 */ 4700 */
4703 if (ldiff >= 0 && hdiff >= 0) { 4701 if (ldiff >= 0 && hdiff >= 0) {
4704 error = fn(cur, recp, priv); 4702 error = fn(cur, recp, priv);
4705 if (error < 0 || 4703 if (error)
4706 error == XFS_BTREE_QUERY_RANGE_ABORT)
4707 break; 4704 break;
4708 } else if (hdiff < 0) { 4705 } else if (hdiff < 0) {
4709 /* Record is larger than high key; pop. */ 4706 /* Record is larger than high key; pop. */
@@ -4774,8 +4771,7 @@ out:
4774 * Query a btree for all records overlapping a given interval of keys. The 4771 * Query a btree for all records overlapping a given interval of keys. The
4775 * supplied function will be called with each record found; return one of the 4772 * supplied function will be called with each record found; return one of the
4776 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 4773 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4777 * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a 4774 * code. This function returns -ECANCELED, zero, or a negative error code.
4778 * negative error code.
4779 */ 4775 */
4780int 4776int
4781xfs_btree_query_range( 4777xfs_btree_query_range(
@@ -4891,7 +4887,7 @@ xfs_btree_has_record_helper(
4891 union xfs_btree_rec *rec, 4887 union xfs_btree_rec *rec,
4892 void *priv) 4888 void *priv)
4893{ 4889{
4894 return XFS_BTREE_QUERY_RANGE_ABORT; 4890 return -ECANCELED;
4895} 4891}
4896 4892
4897/* Is there a record covering a given range of keys? */ 4893/* Is there a record covering a given range of keys? */
@@ -4906,7 +4902,7 @@ xfs_btree_has_record(
4906 4902
4907 error = xfs_btree_query_range(cur, low, high, 4903 error = xfs_btree_query_range(cur, low, high,
4908 &xfs_btree_has_record_helper, NULL); 4904 &xfs_btree_has_record_helper, NULL);
4909 if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 4905 if (error == -ECANCELED) {
4910 *exists = true; 4906 *exists = true;
4911 return 0; 4907 return 0;
4912 } 4908 }
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fa3cd8ab9aba..ced1e65d1483 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -464,9 +464,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
464uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); 464uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
465unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); 465unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
466 466
467/* return codes */ 467/*
468#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ 468 * Return codes for the query range iterator function are 0 to continue
469#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ 469 * iterating, and non-zero to stop iterating. Any non-zero value will be
470 * passed up to the _query_range caller. The special value -ECANCELED can be
471 * used to stop iteration, because _query_range never generates that error
472 * code on its own.
473 */
470typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, 474typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
471 union xfs_btree_rec *rec, void *priv); 475 union xfs_btree_rec *rec, void *priv);
472 476
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 0bf56e94bfe9..4fd1223c1bd5 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2098,7 +2098,7 @@ xfs_da_grow_inode_int(
2098 * If we didn't get it and the block might work if fragmented, 2098 * If we didn't get it and the block might work if fragmented,
2099 * try without the CONTIG flag. Loop until we get it all. 2099 * try without the CONTIG flag. Loop until we get it all.
2100 */ 2100 */
2101 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); 2101 mapp = kmem_alloc(sizeof(*mapp) * count, 0);
2102 for (b = *bno, mapi = 0; b < *bno + count; ) { 2102 for (b = *bno, mapi = 0; b < *bno + count; ) {
2103 nmap = min(XFS_BMAP_MAX_NMAP, count); 2103 nmap = min(XFS_BMAP_MAX_NMAP, count);
2104 c = (int)(*bno + count - b); 2104 c = (int)(*bno + count - b);
@@ -2480,7 +2480,7 @@ xfs_buf_map_from_irec(
2480 2480
2481 if (nirecs > 1) { 2481 if (nirecs > 1) {
2482 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), 2482 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2483 KM_SLEEP | KM_NOFS); 2483 KM_NOFS);
2484 if (!map) 2484 if (!map)
2485 return -ENOMEM; 2485 return -ENOMEM;
2486 *mapp = map; 2486 *mapp = map;
@@ -2539,7 +2539,7 @@ xfs_dabuf_map(
2539 */ 2539 */
2540 if (nfsb != 1) 2540 if (nfsb != 1)
2541 irecs = kmem_zalloc(sizeof(irec) * nfsb, 2541 irecs = kmem_zalloc(sizeof(irec) * nfsb,
2542 KM_SLEEP | KM_NOFS); 2542 KM_NOFS);
2543 2543
2544 nirecs = nfsb; 2544 nirecs = nfsb;
2545 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, 2545 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865b6c3d..ae0bbd20d9ca 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -81,13 +81,15 @@ typedef struct xfs_da_args {
81#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ 81#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
82#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ 82#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
83#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ 83#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
84#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
84 85
85#define XFS_DA_OP_FLAGS \ 86#define XFS_DA_OP_FLAGS \
86 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ 87 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
87 { XFS_DA_OP_RENAME, "RENAME" }, \ 88 { XFS_DA_OP_RENAME, "RENAME" }, \
88 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ 89 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
89 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ 90 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
90 { XFS_DA_OP_CILOOKUP, "CILOOKUP" } 91 { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
92 { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
91 93
92/* 94/*
93 * Storage for holding state during Btree searches and split/join ops. 95 * Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eb2be2a6a25a..22557527cfdb 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -517,7 +517,7 @@ xfs_defer_add(
517 } 517 }
518 if (!dfp) { 518 if (!dfp) {
519 dfp = kmem_alloc(sizeof(struct xfs_defer_pending), 519 dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
520 KM_SLEEP | KM_NOFS); 520 KM_NOFS);
521 dfp->dfp_type = type; 521 dfp->dfp_type = type;
522 dfp->dfp_intent = NULL; 522 dfp->dfp_intent = NULL;
523 dfp->dfp_done = NULL; 523 dfp->dfp_done = NULL;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 67840723edbb..867c5dee0751 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -110,9 +110,9 @@ xfs_da_mount(
110 110
111 nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; 111 nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
112 mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 112 mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
113 KM_SLEEP | KM_MAYFAIL); 113 KM_MAYFAIL);
114 mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 114 mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
115 KM_SLEEP | KM_MAYFAIL); 115 KM_MAYFAIL);
116 if (!mp->m_dir_geo || !mp->m_attr_geo) { 116 if (!mp->m_dir_geo || !mp->m_attr_geo) {
117 kmem_free(mp->m_dir_geo); 117 kmem_free(mp->m_dir_geo);
118 kmem_free(mp->m_attr_geo); 118 kmem_free(mp->m_attr_geo);
@@ -217,7 +217,7 @@ xfs_dir_init(
217 if (error) 217 if (error)
218 return error; 218 return error;
219 219
220 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 220 args = kmem_zalloc(sizeof(*args), KM_NOFS);
221 if (!args) 221 if (!args)
222 return -ENOMEM; 222 return -ENOMEM;
223 223
@@ -254,7 +254,7 @@ xfs_dir_createname(
254 XFS_STATS_INC(dp->i_mount, xs_dir_create); 254 XFS_STATS_INC(dp->i_mount, xs_dir_create);
255 } 255 }
256 256
257 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 257 args = kmem_zalloc(sizeof(*args), KM_NOFS);
258 if (!args) 258 if (!args)
259 return -ENOMEM; 259 return -ENOMEM;
260 260
@@ -353,7 +353,7 @@ xfs_dir_lookup(
353 * lockdep Doing this avoids having to add a bunch of lockdep class 353 * lockdep Doing this avoids having to add a bunch of lockdep class
354 * annotations into the reclaim path for the ilock. 354 * annotations into the reclaim path for the ilock.
355 */ 355 */
356 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 356 args = kmem_zalloc(sizeof(*args), KM_NOFS);
357 args->geo = dp->i_mount->m_dir_geo; 357 args->geo = dp->i_mount->m_dir_geo;
358 args->name = name->name; 358 args->name = name->name;
359 args->namelen = name->len; 359 args->namelen = name->len;
@@ -422,7 +422,7 @@ xfs_dir_removename(
422 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 422 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
423 XFS_STATS_INC(dp->i_mount, xs_dir_remove); 423 XFS_STATS_INC(dp->i_mount, xs_dir_remove);
424 424
425 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 425 args = kmem_zalloc(sizeof(*args), KM_NOFS);
426 if (!args) 426 if (!args)
427 return -ENOMEM; 427 return -ENOMEM;
428 428
@@ -483,7 +483,7 @@ xfs_dir_replace(
483 if (rval) 483 if (rval)
484 return rval; 484 return rval;
485 485
486 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 486 args = kmem_zalloc(sizeof(*args), KM_NOFS);
487 if (!args) 487 if (!args)
488 return -ENOMEM; 488 return -ENOMEM;
489 489
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a6fb0cc2085e..9595ced393dc 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1092,7 +1092,7 @@ xfs_dir2_sf_to_block(
1092 * Copy the directory into a temporary buffer. 1092 * Copy the directory into a temporary buffer.
1093 * Then pitch the incore inode data so we can make extents. 1093 * Then pitch the incore inode data so we can make extents.
1094 */ 1094 */
1095 sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); 1095 sfp = kmem_alloc(ifp->if_bytes, 0);
1096 memcpy(sfp, oldsfp, ifp->if_bytes); 1096 memcpy(sfp, oldsfp, ifp->if_bytes);
1097 1097
1098 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); 1098 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 1fc44efc344d..705c4f562758 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -32,8 +32,6 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, 32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
33 int index, xfs_da_state_blk_t *dblk, 33 int index, xfs_da_state_blk_t *dblk,
34 int *rval); 34 int *rval);
35static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
36 xfs_da_state_blk_t *fblk);
37 35
38/* 36/*
39 * Check internal consistency of a leafn block. 37 * Check internal consistency of a leafn block.
@@ -1611,113 +1609,152 @@ xfs_dir2_leafn_unbalance(
1611} 1609}
1612 1610
1613/* 1611/*
1614 * Top-level node form directory addname routine. 1612 * Add a new data block to the directory at the free space index that the caller
1613 * has specified.
1615 */ 1614 */
1616int /* error */ 1615static int
1617xfs_dir2_node_addname( 1616xfs_dir2_node_add_datablk(
1618 xfs_da_args_t *args) /* operation arguments */ 1617 struct xfs_da_args *args,
1618 struct xfs_da_state_blk *fblk,
1619 xfs_dir2_db_t *dbno,
1620 struct xfs_buf **dbpp,
1621 struct xfs_buf **fbpp,
1622 int *findex)
1619{ 1623{
1620 xfs_da_state_blk_t *blk; /* leaf block for insert */ 1624 struct xfs_inode *dp = args->dp;
1621 int error; /* error return value */ 1625 struct xfs_trans *tp = args->trans;
1622 int rval; /* sub-return value */ 1626 struct xfs_mount *mp = dp->i_mount;
1623 xfs_da_state_t *state; /* btree cursor */ 1627 struct xfs_dir3_icfree_hdr freehdr;
1628 struct xfs_dir2_data_free *bf;
1629 struct xfs_dir2_data_hdr *hdr;
1630 struct xfs_dir2_free *free = NULL;
1631 xfs_dir2_db_t fbno;
1632 struct xfs_buf *fbp;
1633 struct xfs_buf *dbp;
1634 __be16 *bests = NULL;
1635 int error;
1624 1636
1625 trace_xfs_dir2_node_addname(args); 1637 /* Not allowed to allocate, return failure. */
1638 if (args->total == 0)
1639 return -ENOSPC;
1640
1641 /* Allocate and initialize the new data block. */
1642 error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
1643 if (error)
1644 return error;
1645 error = xfs_dir3_data_init(args, *dbno, &dbp);
1646 if (error)
1647 return error;
1626 1648
1627 /* 1649 /*
1628 * Allocate and initialize the state (btree cursor). 1650 * Get the freespace block corresponding to the data block
1629 */ 1651 * that was just allocated.
1630 state = xfs_da_state_alloc();
1631 state->args = args;
1632 state->mp = args->dp->i_mount;
1633 /*
1634 * Look up the name. We're not supposed to find it, but
1635 * this gives us the insertion point.
1636 */ 1652 */
1637 error = xfs_da3_node_lookup_int(state, &rval); 1653 fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
1654 error = xfs_dir2_free_try_read(tp, dp,
1655 xfs_dir2_db_to_da(args->geo, fbno), &fbp);
1638 if (error) 1656 if (error)
1639 rval = error; 1657 return error;
1640 if (rval != -ENOENT) { 1658
1641 goto done;
1642 }
1643 /* 1659 /*
1644 * Add the data entry to a data block. 1660 * If there wasn't a freespace block, the read will
1645 * Extravalid is set to a freeblock found by lookup. 1661 * return a NULL fbp. Allocate and initialize a new one.
1646 */ 1662 */
1647 rval = xfs_dir2_node_addname_int(args, 1663 if (!fbp) {
1648 state->extravalid ? &state->extrablk : NULL); 1664 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
1649 if (rval) { 1665 if (error)
1650 goto done; 1666 return error;
1667
1668 if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
1669 xfs_alert(mp,
1670"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
1671 __func__, (unsigned long long)dp->i_ino,
1672 (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
1673 (long long)*dbno, (long long)fbno);
1674 if (fblk) {
1675 xfs_alert(mp,
1676 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1677 fblk, (unsigned long long)fblk->blkno,
1678 fblk->index, fblk->magic);
1679 } else {
1680 xfs_alert(mp, " ... fblk is NULL");
1681 }
1682 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
1683 return -EFSCORRUPTED;
1684 }
1685
1686 /* Get a buffer for the new block. */
1687 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1688 if (error)
1689 return error;
1690 free = fbp->b_addr;
1691 bests = dp->d_ops->free_bests_p(free);
1692 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1693
1694 /* Remember the first slot as our empty slot. */
1695 freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
1696 XFS_DIR2_FREE_OFFSET)) *
1697 dp->d_ops->free_max_bests(args->geo);
1698 } else {
1699 free = fbp->b_addr;
1700 bests = dp->d_ops->free_bests_p(free);
1701 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1651 } 1702 }
1652 blk = &state->path.blk[state->path.active - 1]; 1703
1653 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); 1704 /* Set the freespace block index from the data block number. */
1705 *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
1706
1707 /* Extend the freespace table if the new data block is off the end. */
1708 if (*findex >= freehdr.nvalid) {
1709 ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
1710 freehdr.nvalid = *findex + 1;
1711 bests[*findex] = cpu_to_be16(NULLDATAOFF);
1712 }
1713
1654 /* 1714 /*
1655 * Add the new leaf entry. 1715 * If this entry was for an empty data block (this should always be
1716 * true) then update the header.
1656 */ 1717 */
1657 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); 1718 if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
1658 if (rval == 0) { 1719 freehdr.nused++;
1659 /* 1720 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1660 * It worked, fix the hash values up the btree. 1721 xfs_dir2_free_log_header(args, fbp);
1661 */
1662 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
1663 xfs_da3_fixhashpath(state, &state->path);
1664 } else {
1665 /*
1666 * It didn't work, we need to split the leaf block.
1667 */
1668 if (args->total == 0) {
1669 ASSERT(rval == -ENOSPC);
1670 goto done;
1671 }
1672 /*
1673 * Split the leaf block and insert the new entry.
1674 */
1675 rval = xfs_da3_split(state);
1676 } 1722 }
1677done: 1723
1678 xfs_da_state_free(state); 1724 /* Update the freespace value for the new block in the table. */
1679 return rval; 1725 hdr = dbp->b_addr;
1726 bf = dp->d_ops->data_bestfree_p(hdr);
1727 bests[*findex] = bf[0].length;
1728
1729 *dbpp = dbp;
1730 *fbpp = fbp;
1731 return 0;
1680} 1732}
1681 1733
1682/* 1734static int
1683 * Add the data entry for a node-format directory name addition. 1735xfs_dir2_node_find_freeblk(
1684 * The leaf entry is added in xfs_dir2_leafn_add. 1736 struct xfs_da_args *args,
1685 * We may enter with a freespace block that the lookup found. 1737 struct xfs_da_state_blk *fblk,
1686 */ 1738 xfs_dir2_db_t *dbnop,
1687static int /* error */ 1739 struct xfs_buf **fbpp,
1688xfs_dir2_node_addname_int( 1740 int *findexp,
1689 xfs_da_args_t *args, /* operation arguments */ 1741 int length)
1690 xfs_da_state_blk_t *fblk) /* optional freespace block */
1691{ 1742{
1692 xfs_dir2_data_hdr_t *hdr; /* data block header */
1693 xfs_dir2_db_t dbno; /* data block number */
1694 struct xfs_buf *dbp; /* data block buffer */
1695 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1696 xfs_inode_t *dp; /* incore directory inode */
1697 xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
1698 int error; /* error return value */
1699 xfs_dir2_db_t fbno; /* freespace block number */
1700 struct xfs_buf *fbp; /* freespace buffer */
1701 int findex; /* freespace entry index */
1702 xfs_dir2_free_t *free=NULL; /* freespace block structure */
1703 xfs_dir2_db_t ifbno; /* initial freespace block no */
1704 xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
1705 int length; /* length of the new entry */
1706 int logfree; /* need to log free entry */
1707 xfs_mount_t *mp; /* filesystem mount point */
1708 int needlog; /* need to log data header */
1709 int needscan; /* need to rescan data frees */
1710 __be16 *tagp; /* data entry tag pointer */
1711 xfs_trans_t *tp; /* transaction pointer */
1712 __be16 *bests;
1713 struct xfs_dir3_icfree_hdr freehdr; 1743 struct xfs_dir3_icfree_hdr freehdr;
1714 struct xfs_dir2_data_free *bf; 1744 struct xfs_dir2_free *free = NULL;
1715 xfs_dir2_data_aoff_t aoff; 1745 struct xfs_inode *dp = args->dp;
1746 struct xfs_trans *tp = args->trans;
1747 struct xfs_buf *fbp = NULL;
1748 xfs_dir2_db_t firstfbno;
1749 xfs_dir2_db_t lastfbno;
1750 xfs_dir2_db_t ifbno = -1;
1751 xfs_dir2_db_t dbno = -1;
1752 xfs_dir2_db_t fbno;
1753 xfs_fileoff_t fo;
1754 __be16 *bests = NULL;
1755 int findex = 0;
1756 int error;
1716 1757
1717 dp = args->dp;
1718 mp = dp->i_mount;
1719 tp = args->trans;
1720 length = dp->d_ops->data_entsize(args->namelen);
1721 /* 1758 /*
1722 * If we came in with a freespace block that means that lookup 1759 * If we came in with a freespace block that means that lookup
1723 * found an entry with our hash value. This is the freespace 1760 * found an entry with our hash value. This is the freespace
@@ -1725,288 +1762,157 @@ xfs_dir2_node_addname_int(
1725 */ 1762 */
1726 if (fblk) { 1763 if (fblk) {
1727 fbp = fblk->bp; 1764 fbp = fblk->bp;
1728 /*
1729 * Remember initial freespace block number.
1730 */
1731 ifbno = fblk->blkno;
1732 free = fbp->b_addr; 1765 free = fbp->b_addr;
1733 findex = fblk->index; 1766 findex = fblk->index;
1734 bests = dp->d_ops->free_bests_p(free);
1735 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1736
1737 /*
1738 * This means the free entry showed that the data block had
1739 * space for our entry, so we remembered it.
1740 * Use that data block.
1741 */
1742 if (findex >= 0) { 1767 if (findex >= 0) {
1768 /* caller already found the freespace for us. */
1769 bests = dp->d_ops->free_bests_p(free);
1770 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1771
1743 ASSERT(findex < freehdr.nvalid); 1772 ASSERT(findex < freehdr.nvalid);
1744 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 1773 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
1745 ASSERT(be16_to_cpu(bests[findex]) >= length); 1774 ASSERT(be16_to_cpu(bests[findex]) >= length);
1746 dbno = freehdr.firstdb + findex; 1775 dbno = freehdr.firstdb + findex;
1747 } else { 1776 goto found_block;
1748 /*
1749 * The data block looked at didn't have enough room.
1750 * We'll start at the beginning of the freespace entries.
1751 */
1752 dbno = -1;
1753 findex = 0;
1754 } 1777 }
1755 } else { 1778
1756 /* 1779 /*
1757 * Didn't come in with a freespace block, so no data block. 1780 * The data block looked at didn't have enough room.
1781 * We'll start at the beginning of the freespace entries.
1758 */ 1782 */
1759 ifbno = dbno = -1; 1783 ifbno = fblk->blkno;
1784 xfs_trans_brelse(tp, fbp);
1760 fbp = NULL; 1785 fbp = NULL;
1761 findex = 0; 1786 fblk->bp = NULL;
1762 } 1787 }
1763 1788
1764 /* 1789 /*
1765 * If we don't have a data block yet, we're going to scan the 1790 * If we don't have a data block yet, we're going to scan the freespace
1766 * freespace blocks looking for one. Figure out what the 1791 * data for a data block with enough free space in it.
1767 * highest freespace block number is.
1768 */
1769 if (dbno == -1) {
1770 xfs_fileoff_t fo; /* freespace block number */
1771
1772 if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
1773 return error;
1774 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1775 fbno = ifbno;
1776 }
1777 /*
1778 * While we haven't identified a data block, search the freeblock
1779 * data for a good data block. If we find a null freeblock entry,
1780 * indicating a hole in the data blocks, remember that.
1781 */ 1792 */
1782 while (dbno == -1) { 1793 error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
1783 /* 1794 if (error)
1784 * If we don't have a freeblock in hand, get the next one. 1795 return error;
1785 */ 1796 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1786 if (fbp == NULL) { 1797 firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
1787 /*
1788 * Happens the first time through unless lookup gave
1789 * us a freespace block to start with.
1790 */
1791 if (++fbno == 0)
1792 fbno = xfs_dir2_byte_to_db(args->geo,
1793 XFS_DIR2_FREE_OFFSET);
1794 /*
1795 * If it's ifbno we already looked at it.
1796 */
1797 if (fbno == ifbno)
1798 fbno++;
1799 /*
1800 * If it's off the end we're done.
1801 */
1802 if (fbno >= lastfbno)
1803 break;
1804 /*
1805 * Read the block. There can be holes in the
1806 * freespace blocks, so this might not succeed.
1807 * This should be really rare, so there's no reason
1808 * to avoid it.
1809 */
1810 error = xfs_dir2_free_try_read(tp, dp,
1811 xfs_dir2_db_to_da(args->geo, fbno),
1812 &fbp);
1813 if (error)
1814 return error;
1815 if (!fbp)
1816 continue;
1817 free = fbp->b_addr;
1818 findex = 0;
1819 }
1820 /*
1821 * Look at the current free entry. Is it good enough?
1822 *
1823 * The bests initialisation should be where the bufer is read in
1824 * the above branch. But gcc is too stupid to realise that bests
1825 * and the freehdr are actually initialised if they are placed
1826 * there, so we have to do it here to avoid warnings. Blech.
1827 */
1828 bests = dp->d_ops->free_bests_p(free);
1829 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1830 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1831 be16_to_cpu(bests[findex]) >= length)
1832 dbno = freehdr.firstdb + findex;
1833 else {
1834 /*
1835 * Are we done with the freeblock?
1836 */
1837 if (++findex == freehdr.nvalid) {
1838 /*
1839 * Drop the block.
1840 */
1841 xfs_trans_brelse(tp, fbp);
1842 fbp = NULL;
1843 if (fblk && fblk->bp)
1844 fblk->bp = NULL;
1845 }
1846 }
1847 }
1848 /*
1849 * If we don't have a data block, we need to allocate one and make
1850 * the freespace entries refer to it.
1851 */
1852 if (unlikely(dbno == -1)) {
1853 /*
1854 * Not allowed to allocate, return failure.
1855 */
1856 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
1857 return -ENOSPC;
1858
1859 /*
1860 * Allocate and initialize the new data block.
1861 */
1862 if (unlikely((error = xfs_dir2_grow_inode(args,
1863 XFS_DIR2_DATA_SPACE,
1864 &dbno)) ||
1865 (error = xfs_dir3_data_init(args, dbno, &dbp))))
1866 return error;
1867 1798
1868 /* 1799 for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
1869 * If (somehow) we have a freespace block, get rid of it. 1800 /* If it's ifbno we already looked at it. */
1870 */ 1801 if (fbno == ifbno)
1871 if (fbp) 1802 continue;
1872 xfs_trans_brelse(tp, fbp);
1873 if (fblk && fblk->bp)
1874 fblk->bp = NULL;
1875 1803
1876 /* 1804 /*
1877 * Get the freespace block corresponding to the data block 1805 * Read the block. There can be holes in the freespace blocks,
1878 * that was just allocated. 1806 * so this might not succeed. This should be really rare, so
1807 * there's no reason to avoid it.
1879 */ 1808 */
1880 fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
1881 error = xfs_dir2_free_try_read(tp, dp, 1809 error = xfs_dir2_free_try_read(tp, dp,
1882 xfs_dir2_db_to_da(args->geo, fbno), 1810 xfs_dir2_db_to_da(args->geo, fbno),
1883 &fbp); 1811 &fbp);
1884 if (error) 1812 if (error)
1885 return error; 1813 return error;
1814 if (!fbp)
1815 continue;
1886 1816
1887 /* 1817 free = fbp->b_addr;
1888 * If there wasn't a freespace block, the read will 1818 bests = dp->d_ops->free_bests_p(free);
1889 * return a NULL fbp. Allocate and initialize a new one. 1819 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1890 */
1891 if (!fbp) {
1892 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
1893 &fbno);
1894 if (error)
1895 return error;
1896 1820
1897 if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { 1821 /* Scan the free entry array for a large enough free space. */
1898 xfs_alert(mp, 1822 for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
1899"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", 1823 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1900 __func__, (unsigned long long)dp->i_ino, 1824 be16_to_cpu(bests[findex]) >= length) {
1901 (long long)dp->d_ops->db_to_fdb( 1825 dbno = freehdr.firstdb + findex;
1902 args->geo, dbno), 1826 goto found_block;
1903 (long long)dbno, (long long)fbno,
1904 (unsigned long long)ifbno, lastfbno);
1905 if (fblk) {
1906 xfs_alert(mp,
1907 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1908 fblk,
1909 (unsigned long long)fblk->blkno,
1910 fblk->index,
1911 fblk->magic);
1912 } else {
1913 xfs_alert(mp, " ... fblk is NULL");
1914 }
1915 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1916 XFS_ERRLEVEL_LOW, mp);
1917 return -EFSCORRUPTED;
1918 } 1827 }
1919
1920 /*
1921 * Get a buffer for the new block.
1922 */
1923 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1924 if (error)
1925 return error;
1926 free = fbp->b_addr;
1927 bests = dp->d_ops->free_bests_p(free);
1928 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1929
1930 /*
1931 * Remember the first slot as our empty slot.
1932 */
1933 freehdr.firstdb =
1934 (fbno - xfs_dir2_byte_to_db(args->geo,
1935 XFS_DIR2_FREE_OFFSET)) *
1936 dp->d_ops->free_max_bests(args->geo);
1937 } else {
1938 free = fbp->b_addr;
1939 bests = dp->d_ops->free_bests_p(free);
1940 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1941 } 1828 }
1942 1829
1943 /* 1830 /* Didn't find free space, go on to next free block */
1944 * Set the freespace block index from the data block number. 1831 xfs_trans_brelse(tp, fbp);
1945 */
1946 findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
1947 /*
1948 * If it's after the end of the current entries in the
1949 * freespace block, extend that table.
1950 */
1951 if (findex >= freehdr.nvalid) {
1952 ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
1953 freehdr.nvalid = findex + 1;
1954 /*
1955 * Tag new entry so nused will go up.
1956 */
1957 bests[findex] = cpu_to_be16(NULLDATAOFF);
1958 }
1959 /*
1960 * If this entry was for an empty data block
1961 * (this should always be true) then update the header.
1962 */
1963 if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
1964 freehdr.nused++;
1965 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1966 xfs_dir2_free_log_header(args, fbp);
1967 }
1968 /*
1969 * Update the real value in the table.
1970 * We haven't allocated the data entry yet so this will
1971 * change again.
1972 */
1973 hdr = dbp->b_addr;
1974 bf = dp->d_ops->data_bestfree_p(hdr);
1975 bests[findex] = bf[0].length;
1976 logfree = 1;
1977 } 1832 }
1833
1834found_block:
1835 *dbnop = dbno;
1836 *fbpp = fbp;
1837 *findexp = findex;
1838 return 0;
1839}
1840
1841
1842/*
1843 * Add the data entry for a node-format directory name addition.
1844 * The leaf entry is added in xfs_dir2_leafn_add.
1845 * We may enter with a freespace block that the lookup found.
1846 */
1847static int
1848xfs_dir2_node_addname_int(
1849 struct xfs_da_args *args, /* operation arguments */
1850 struct xfs_da_state_blk *fblk) /* optional freespace block */
1851{
1852 struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
1853 struct xfs_dir2_data_entry *dep; /* data entry pointer */
1854 struct xfs_dir2_data_hdr *hdr; /* data block header */
1855 struct xfs_dir2_data_free *bf;
1856 struct xfs_dir2_free *free = NULL; /* freespace block structure */
1857 struct xfs_trans *tp = args->trans;
1858 struct xfs_inode *dp = args->dp;
1859 struct xfs_buf *dbp; /* data block buffer */
1860 struct xfs_buf *fbp; /* freespace buffer */
1861 xfs_dir2_data_aoff_t aoff;
1862 xfs_dir2_db_t dbno; /* data block number */
1863 int error; /* error return value */
1864 int findex; /* freespace entry index */
1865 int length; /* length of the new entry */
1866 int logfree = 0; /* need to log free entry */
1867 int needlog = 0; /* need to log data header */
1868 int needscan = 0; /* need to rescan data frees */
1869 __be16 *tagp; /* data entry tag pointer */
1870 __be16 *bests;
1871
1872 length = dp->d_ops->data_entsize(args->namelen);
1873 error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
1874 length);
1875 if (error)
1876 return error;
1877
1978 /* 1878 /*
1979 * We had a data block so we don't have to make a new one. 1879 * Now we know if we must allocate blocks, so if we are checking whether
1880 * we can insert without allocation then we can return now.
1980 */ 1881 */
1981 else { 1882 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
1982 /* 1883 if (dbno == -1)
1983 * If just checking, we succeeded. 1884 return -ENOSPC;
1984 */ 1885 return 0;
1985 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 1886 }
1986 return 0;
1987 1887
1988 /* 1888 /*
1989 * Read the data block in. 1889 * If we don't have a data block, we need to allocate one and make
1990 */ 1890 * the freespace entries refer to it.
1891 */
1892 if (dbno == -1) {
1893 /* we're going to have to log the free block index later */
1894 logfree = 1;
1895 error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
1896 &findex);
1897 } else {
1898 /* Read the data block in. */
1991 error = xfs_dir3_data_read(tp, dp, 1899 error = xfs_dir3_data_read(tp, dp,
1992 xfs_dir2_db_to_da(args->geo, dbno), 1900 xfs_dir2_db_to_da(args->geo, dbno),
1993 -1, &dbp); 1901 -1, &dbp);
1994 if (error)
1995 return error;
1996 hdr = dbp->b_addr;
1997 bf = dp->d_ops->data_bestfree_p(hdr);
1998 logfree = 0;
1999 } 1902 }
1903 if (error)
1904 return error;
1905
1906 /* setup for data block up now */
1907 hdr = dbp->b_addr;
1908 bf = dp->d_ops->data_bestfree_p(hdr);
2000 ASSERT(be16_to_cpu(bf[0].length) >= length); 1909 ASSERT(be16_to_cpu(bf[0].length) >= length);
2001 /* 1910
2002 * Point to the existing unused space. 1911 /* Point to the existing unused space. */
2003 */
2004 dup = (xfs_dir2_data_unused_t *) 1912 dup = (xfs_dir2_data_unused_t *)
2005 ((char *)hdr + be16_to_cpu(bf[0].offset)); 1913 ((char *)hdr + be16_to_cpu(bf[0].offset));
2006 needscan = needlog = 0; 1914
2007 /* 1915 /* Mark the first part of the unused space, inuse for us. */
2008 * Mark the first part of the unused space, inuse for us.
2009 */
2010 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 1916 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
2011 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 1917 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
2012 &needlog, &needscan); 1918 &needlog, &needscan);
@@ -2014,9 +1920,8 @@ xfs_dir2_node_addname_int(
2014 xfs_trans_brelse(tp, dbp); 1920 xfs_trans_brelse(tp, dbp);
2015 return error; 1921 return error;
2016 } 1922 }
2017 /* 1923
2018 * Fill in the new entry and log it. 1924 /* Fill in the new entry and log it. */
2019 */
2020 dep = (xfs_dir2_data_entry_t *)dup; 1925 dep = (xfs_dir2_data_entry_t *)dup;
2021 dep->inumber = cpu_to_be64(args->inumber); 1926 dep->inumber = cpu_to_be64(args->inumber);
2022 dep->namelen = args->namelen; 1927 dep->namelen = args->namelen;
@@ -2025,38 +1930,101 @@ xfs_dir2_node_addname_int(
2025 tagp = dp->d_ops->data_entry_tag_p(dep); 1930 tagp = dp->d_ops->data_entry_tag_p(dep);
2026 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1931 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
2027 xfs_dir2_data_log_entry(args, dbp, dep); 1932 xfs_dir2_data_log_entry(args, dbp, dep);
2028 /* 1933
2029 * Rescan the block for bestfree if needed. 1934 /* Rescan the freespace and log the data block if needed. */
2030 */
2031 if (needscan) 1935 if (needscan)
2032 xfs_dir2_data_freescan(dp, hdr, &needlog); 1936 xfs_dir2_data_freescan(dp, hdr, &needlog);
2033 /*
2034 * Log the data block header if needed.
2035 */
2036 if (needlog) 1937 if (needlog)
2037 xfs_dir2_data_log_header(args, dbp); 1938 xfs_dir2_data_log_header(args, dbp);
2038 /* 1939
2039 * If the freespace entry is now wrong, update it. 1940 /* If the freespace block entry is now wrong, update it. */
2040 */ 1941 free = fbp->b_addr;
2041 bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ 1942 bests = dp->d_ops->free_bests_p(free);
2042 if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { 1943 if (bests[findex] != bf[0].length) {
2043 bests[findex] = bf[0].length; 1944 bests[findex] = bf[0].length;
2044 logfree = 1; 1945 logfree = 1;
2045 } 1946 }
2046 /* 1947
2047 * Log the freespace entry if needed. 1948 /* Log the freespace entry if needed. */
2048 */
2049 if (logfree) 1949 if (logfree)
2050 xfs_dir2_free_log_bests(args, fbp, findex, findex); 1950 xfs_dir2_free_log_bests(args, fbp, findex, findex);
2051 /* 1951
2052 * Return the data block and offset in args, then drop the data block. 1952 /* Return the data block and offset in args. */
2053 */
2054 args->blkno = (xfs_dablk_t)dbno; 1953 args->blkno = (xfs_dablk_t)dbno;
2055 args->index = be16_to_cpu(*tagp); 1954 args->index = be16_to_cpu(*tagp);
2056 return 0; 1955 return 0;
2057} 1956}
2058 1957
2059/* 1958/*
1959 * Top-level node form directory addname routine.
1960 */
1961int /* error */
1962xfs_dir2_node_addname(
1963 xfs_da_args_t *args) /* operation arguments */
1964{
1965 xfs_da_state_blk_t *blk; /* leaf block for insert */
1966 int error; /* error return value */
1967 int rval; /* sub-return value */
1968 xfs_da_state_t *state; /* btree cursor */
1969
1970 trace_xfs_dir2_node_addname(args);
1971
1972 /*
1973 * Allocate and initialize the state (btree cursor).
1974 */
1975 state = xfs_da_state_alloc();
1976 state->args = args;
1977 state->mp = args->dp->i_mount;
1978 /*
1979 * Look up the name. We're not supposed to find it, but
1980 * this gives us the insertion point.
1981 */
1982 error = xfs_da3_node_lookup_int(state, &rval);
1983 if (error)
1984 rval = error;
1985 if (rval != -ENOENT) {
1986 goto done;
1987 }
1988 /*
1989 * Add the data entry to a data block.
1990 * Extravalid is set to a freeblock found by lookup.
1991 */
1992 rval = xfs_dir2_node_addname_int(args,
1993 state->extravalid ? &state->extrablk : NULL);
1994 if (rval) {
1995 goto done;
1996 }
1997 blk = &state->path.blk[state->path.active - 1];
1998 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1999 /*
2000 * Add the new leaf entry.
2001 */
2002 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
2003 if (rval == 0) {
2004 /*
2005 * It worked, fix the hash values up the btree.
2006 */
2007 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
2008 xfs_da3_fixhashpath(state, &state->path);
2009 } else {
2010 /*
2011 * It didn't work, we need to split the leaf block.
2012 */
2013 if (args->total == 0) {
2014 ASSERT(rval == -ENOSPC);
2015 goto done;
2016 }
2017 /*
2018 * Split the leaf block and insert the new entry.
2019 */
2020 rval = xfs_da3_split(state);
2021 }
2022done:
2023 xfs_da_state_free(state);
2024 return rval;
2025}
2026
2027/*
2060 * Lookup an entry in a node-format directory. 2028 * Lookup an entry in a node-format directory.
2061 * All the real work happens in xfs_da3_node_lookup_int. 2029 * All the real work happens in xfs_da3_node_lookup_int.
2062 * The only real output is the inode number of the entry. 2030 * The only real output is the inode number of the entry.
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 033589257f54..85f14fc2a8da 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -164,7 +164,7 @@ xfs_dir2_block_to_sf(
164 * can free the block and copy the formatted data into the inode literal 164 * can free the block and copy the formatted data into the inode literal
165 * area. 165 * area.
166 */ 166 */
167 dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); 167 dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
168 hdr = bp->b_addr; 168 hdr = bp->b_addr;
169 169
170 /* 170 /*
@@ -436,7 +436,7 @@ xfs_dir2_sf_addname_hard(
436 436
437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
438 old_isize = (int)dp->i_d.di_size; 438 old_isize = (int)dp->i_d.di_size;
439 buf = kmem_alloc(old_isize, KM_SLEEP); 439 buf = kmem_alloc(old_isize, 0);
440 oldsfp = (xfs_dir2_sf_hdr_t *)buf; 440 oldsfp = (xfs_dir2_sf_hdr_t *)buf;
441 memcpy(oldsfp, sfp, old_isize); 441 memcpy(oldsfp, sfp, old_isize);
442 /* 442 /*
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_toino4(
1096 * Don't want xfs_idata_realloc copying the data here. 1096 * Don't want xfs_idata_realloc copying the data here.
1097 */ 1097 */
1098 oldsize = dp->i_df.if_bytes; 1098 oldsize = dp->i_df.if_bytes;
1099 buf = kmem_alloc(oldsize, KM_SLEEP); 1099 buf = kmem_alloc(oldsize, 0);
1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1101 ASSERT(oldsfp->i8count == 1); 1101 ASSERT(oldsfp->i8count == 1);
1102 memcpy(buf, oldsfp, oldsize); 1102 memcpy(buf, oldsfp, oldsize);
@@ -1169,7 +1169,7 @@ xfs_dir2_sf_toino8(
1169 * Don't want xfs_idata_realloc copying the data here. 1169 * Don't want xfs_idata_realloc copying the data here.
1170 */ 1170 */
1171 oldsize = dp->i_df.if_bytes; 1171 oldsize = dp->i_df.if_bytes;
1172 buf = kmem_alloc(oldsize, KM_SLEEP); 1172 buf = kmem_alloc(oldsize, 0);
1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1174 ASSERT(oldsfp->i8count == 0); 1174 ASSERT(oldsfp->i8count == 0);
1175 memcpy(buf, oldsfp, oldsize); 1175 memcpy(buf, oldsfp, oldsize);
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 52d03a3a02a4..39dd2b908106 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -287,7 +287,7 @@ struct xfs_ag_geometry {
287 uint32_t ag_ifree; /* o: inodes free */ 287 uint32_t ag_ifree; /* o: inodes free */
288 uint32_t ag_sick; /* o: sick things in ag */ 288 uint32_t ag_sick; /* o: sick things in ag */
289 uint32_t ag_checked; /* o: checked metadata in ag */ 289 uint32_t ag_checked; /* o: checked metadata in ag */
290 uint32_t ag_reserved32; /* o: zero */ 290 uint32_t ag_flags; /* i/o: flags for this ag */
291 uint64_t ag_reserved[12];/* o: zero */ 291 uint64_t ag_reserved[12];/* o: zero */
292}; 292};
293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ 293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 04377ab75863..588d44613094 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2787,8 +2787,13 @@ xfs_ialloc_setup_geometry(
2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, 2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
2788 inodes); 2788 inodes);
2789 2789
2790 /* Set the maximum inode count for this filesystem. */ 2790 /*
2791 if (sbp->sb_imax_pct) { 2791 * Set the maximum inode count for this filesystem, being careful not
2792 * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
2793 * users should never get here due to failing sb verification, but
2794 * certain users (xfs_db) need to be usable even with corrupt metadata.
2795 */
2796 if (sbp->sb_imax_pct && igeo->ialloc_blks) {
2792 /* 2797 /*
2793 * Make sure the maximum inode count is a multiple 2798 * Make sure the maximum inode count is a multiple
2794 * of the units we allocate inodes in. 2799 * of the units we allocate inodes in.
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 27aa3f2bc4bc..7bc87408f1a0 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -616,7 +616,7 @@ xfs_iext_realloc_root(
616 * sequence counter is seen before the modifications to the extent tree itself 616 * sequence counter is seen before the modifications to the extent tree itself
617 * take effect. 617 * take effect.
618 */ 618 */
619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) 619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
620{ 620{
621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); 621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
622} 622}
@@ -633,7 +633,7 @@ xfs_iext_insert(
633 struct xfs_iext_leaf *new = NULL; 633 struct xfs_iext_leaf *new = NULL;
634 int nr_entries, i; 634 int nr_entries, i;
635 635
636 xfs_iext_inc_seq(ifp, state); 636 xfs_iext_inc_seq(ifp);
637 637
638 if (ifp->if_height == 0) 638 if (ifp->if_height == 0)
639 xfs_iext_alloc_root(ifp, cur); 639 xfs_iext_alloc_root(ifp, cur);
@@ -875,7 +875,7 @@ xfs_iext_remove(
875 ASSERT(ifp->if_u1.if_root != NULL); 875 ASSERT(ifp->if_u1.if_root != NULL);
876 ASSERT(xfs_iext_valid(ifp, cur)); 876 ASSERT(xfs_iext_valid(ifp, cur));
877 877
878 xfs_iext_inc_seq(ifp, state); 878 xfs_iext_inc_seq(ifp);
879 879
880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; 880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
881 for (i = cur->pos; i < nr_entries; i++) 881 for (i = cur->pos; i < nr_entries; i++)
@@ -983,7 +983,7 @@ xfs_iext_update_extent(
983{ 983{
984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); 984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
985 985
986 xfs_iext_inc_seq(ifp, state); 986 xfs_iext_inc_seq(ifp);
987 987
988 if (cur->pos == 0) { 988 if (cur->pos == 0) {
989 struct xfs_bmbt_irec old; 989 struct xfs_bmbt_irec old;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bf3e04018246..c643beeb5a24 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -94,7 +94,7 @@ xfs_iformat_fork(
94 return 0; 94 return 0;
95 95
96 ASSERT(ip->i_afp == NULL); 96 ASSERT(ip->i_afp == NULL);
97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
98 98
99 switch (dip->di_aformat) { 99 switch (dip->di_aformat) {
100 case XFS_DINODE_FMT_LOCAL: 100 case XFS_DINODE_FMT_LOCAL:
@@ -147,7 +147,7 @@ xfs_init_local_fork(
147 147
148 if (size) { 148 if (size) {
149 real_size = roundup(mem_size, 4); 149 real_size = roundup(mem_size, 4);
150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); 150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
151 memcpy(ifp->if_u1.if_data, data, size); 151 memcpy(ifp->if_u1.if_data, data, size);
152 if (zero_terminate) 152 if (zero_terminate)
153 ifp->if_u1.if_data[size] = '\0'; 153 ifp->if_u1.if_data[size] = '\0';
@@ -302,7 +302,7 @@ xfs_iformat_btree(
302 } 302 }
303 303
304 ifp->if_broot_bytes = size; 304 ifp->if_broot_bytes = size;
305 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); 305 ifp->if_broot = kmem_alloc(size, KM_NOFS);
306 ASSERT(ifp->if_broot != NULL); 306 ASSERT(ifp->if_broot != NULL);
307 /* 307 /*
308 * Copy and convert from the on-disk structure 308 * Copy and convert from the on-disk structure
@@ -367,7 +367,7 @@ xfs_iroot_realloc(
367 */ 367 */
368 if (ifp->if_broot_bytes == 0) { 368 if (ifp->if_broot_bytes == 0) {
369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); 369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
370 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 370 ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
371 ifp->if_broot_bytes = (int)new_size; 371 ifp->if_broot_bytes = (int)new_size;
372 return; 372 return;
373 } 373 }
@@ -382,7 +382,7 @@ xfs_iroot_realloc(
382 new_max = cur_max + rec_diff; 382 new_max = cur_max + rec_diff;
383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); 383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
385 KM_SLEEP | KM_NOFS); 385 KM_NOFS);
386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
387 ifp->if_broot_bytes); 387 ifp->if_broot_bytes);
388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -408,7 +408,7 @@ xfs_iroot_realloc(
408 else 408 else
409 new_size = 0; 409 new_size = 0;
410 if (new_size > 0) { 410 if (new_size > 0) {
411 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 411 new_broot = kmem_alloc(new_size, KM_NOFS);
412 /* 412 /*
413 * First copy over the btree block header. 413 * First copy over the btree block header.
414 */ 414 */
@@ -492,7 +492,7 @@ xfs_idata_realloc(
492 * We enforce that here. 492 * We enforce that here.
493 */ 493 */
494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, 494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
495 roundup(new_size, 4), KM_SLEEP | KM_NOFS); 495 roundup(new_size, 4), KM_NOFS);
496 ifp->if_bytes = new_size; 496 ifp->if_bytes = new_size;
497} 497}
498 498
@@ -683,7 +683,7 @@ xfs_ifork_init_cow(
683 return; 683 return;
684 684
685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, 685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
686 KM_SLEEP | KM_NOFS); 686 KM_NOFS);
687 ip->i_cowfp->if_flags = XFS_IFEXTENTS; 687 ip->i_cowfp->if_flags = XFS_IFEXTENTS;
688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS; 688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
689 ip->i_cnextents = 0; 689 ip->i_cnextents = 0;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 51bb9bdb0e84..9a7fadb1361c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1174,7 +1174,7 @@ out_cur:
1174/* 1174/*
1175 * Record a refcount intent for later processing. 1175 * Record a refcount intent for later processing.
1176 */ 1176 */
1177static int 1177static void
1178__xfs_refcount_add( 1178__xfs_refcount_add(
1179 struct xfs_trans *tp, 1179 struct xfs_trans *tp,
1180 enum xfs_refcount_intent_type type, 1180 enum xfs_refcount_intent_type type,
@@ -1189,44 +1189,43 @@ __xfs_refcount_add(
1189 blockcount); 1189 blockcount);
1190 1190
1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent), 1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
1192 KM_SLEEP | KM_NOFS); 1192 KM_NOFS);
1193 INIT_LIST_HEAD(&ri->ri_list); 1193 INIT_LIST_HEAD(&ri->ri_list);
1194 ri->ri_type = type; 1194 ri->ri_type = type;
1195 ri->ri_startblock = startblock; 1195 ri->ri_startblock = startblock;
1196 ri->ri_blockcount = blockcount; 1196 ri->ri_blockcount = blockcount;
1197 1197
1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); 1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
1199 return 0;
1200} 1199}
1201 1200
1202/* 1201/*
1203 * Increase the reference count of the blocks backing a file's extent. 1202 * Increase the reference count of the blocks backing a file's extent.
1204 */ 1203 */
1205int 1204void
1206xfs_refcount_increase_extent( 1205xfs_refcount_increase_extent(
1207 struct xfs_trans *tp, 1206 struct xfs_trans *tp,
1208 struct xfs_bmbt_irec *PREV) 1207 struct xfs_bmbt_irec *PREV)
1209{ 1208{
1210 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1209 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1211 return 0; 1210 return;
1212 1211
1213 return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, 1212 __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
1214 PREV->br_startblock, PREV->br_blockcount); 1213 PREV->br_blockcount);
1215} 1214}
1216 1215
1217/* 1216/*
1218 * Decrease the reference count of the blocks backing a file's extent. 1217 * Decrease the reference count of the blocks backing a file's extent.
1219 */ 1218 */
1220int 1219void
1221xfs_refcount_decrease_extent( 1220xfs_refcount_decrease_extent(
1222 struct xfs_trans *tp, 1221 struct xfs_trans *tp,
1223 struct xfs_bmbt_irec *PREV) 1222 struct xfs_bmbt_irec *PREV)
1224{ 1223{
1225 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1224 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1226 return 0; 1225 return;
1227 1226
1228 return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, 1227 __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
1229 PREV->br_startblock, PREV->br_blockcount); 1228 PREV->br_blockcount);
1230} 1229}
1231 1230
1232/* 1231/*
@@ -1541,47 +1540,40 @@ __xfs_refcount_cow_free(
1541} 1540}
1542 1541
1543/* Record a CoW staging extent in the refcount btree. */ 1542/* Record a CoW staging extent in the refcount btree. */
1544int 1543void
1545xfs_refcount_alloc_cow_extent( 1544xfs_refcount_alloc_cow_extent(
1546 struct xfs_trans *tp, 1545 struct xfs_trans *tp,
1547 xfs_fsblock_t fsb, 1546 xfs_fsblock_t fsb,
1548 xfs_extlen_t len) 1547 xfs_extlen_t len)
1549{ 1548{
1550 struct xfs_mount *mp = tp->t_mountp; 1549 struct xfs_mount *mp = tp->t_mountp;
1551 int error;
1552 1550
1553 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1551 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1554 return 0; 1552 return;
1555 1553
1556 error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1554 __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
1557 if (error)
1558 return error;
1559 1555
1560 /* Add rmap entry */ 1556 /* Add rmap entry */
1561 return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1557 xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1562 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1558 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1563} 1559}
1564 1560
1565/* Forget a CoW staging event in the refcount btree. */ 1561/* Forget a CoW staging event in the refcount btree. */
1566int 1562void
1567xfs_refcount_free_cow_extent( 1563xfs_refcount_free_cow_extent(
1568 struct xfs_trans *tp, 1564 struct xfs_trans *tp,
1569 xfs_fsblock_t fsb, 1565 xfs_fsblock_t fsb,
1570 xfs_extlen_t len) 1566 xfs_extlen_t len)
1571{ 1567{
1572 struct xfs_mount *mp = tp->t_mountp; 1568 struct xfs_mount *mp = tp->t_mountp;
1573 int error;
1574 1569
1575 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1570 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1576 return 0; 1571 return;
1577 1572
1578 /* Remove rmap entry */ 1573 /* Remove rmap entry */
1579 error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1574 xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1580 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1575 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1581 if (error) 1576 __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1582 return error;
1583
1584 return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1585} 1577}
1586 1578
1587struct xfs_refcount_recovery { 1579struct xfs_refcount_recovery {
@@ -1602,7 +1594,7 @@ xfs_refcount_recover_extent(
1602 if (be32_to_cpu(rec->refc.rc_refcount) != 1) 1594 if (be32_to_cpu(rec->refc.rc_refcount) != 1)
1603 return -EFSCORRUPTED; 1595 return -EFSCORRUPTED;
1604 1596
1605 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); 1597 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
1606 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); 1598 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
1607 list_add_tail(&rr->rr_list, debris); 1599 list_add_tail(&rr->rr_list, debris);
1608 1600
@@ -1679,10 +1671,8 @@ xfs_refcount_recover_cow_leftovers(
1679 /* Free the orphan record */ 1671 /* Free the orphan record */
1680 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; 1672 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
1681 fsb = XFS_AGB_TO_FSB(mp, agno, agbno); 1673 fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
1682 error = xfs_refcount_free_cow_extent(tp, fsb, 1674 xfs_refcount_free_cow_extent(tp, fsb,
1683 rr->rr_rrec.rc_blockcount); 1675 rr->rr_rrec.rc_blockcount);
1684 if (error)
1685 goto out_trans;
1686 1676
1687 /* Free the block. */ 1677 /* Free the block. */
1688 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); 1678 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518575e7..209795539c8d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -29,9 +29,9 @@ struct xfs_refcount_intent {
29 xfs_extlen_t ri_blockcount; 29 xfs_extlen_t ri_blockcount;
30}; 30};
31 31
32extern int xfs_refcount_increase_extent(struct xfs_trans *tp, 32void xfs_refcount_increase_extent(struct xfs_trans *tp,
33 struct xfs_bmbt_irec *irec); 33 struct xfs_bmbt_irec *irec);
34extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, 34void xfs_refcount_decrease_extent(struct xfs_trans *tp,
35 struct xfs_bmbt_irec *irec); 35 struct xfs_bmbt_irec *irec);
36 36
37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, 37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, 45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
46 xfs_extlen_t *flen, bool find_end_of_shared); 46 xfs_extlen_t *flen, bool find_end_of_shared);
47 47
48extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, 48void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
49 xfs_fsblock_t fsb, xfs_extlen_t len); 49 xfs_extlen_t len);
50extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, 50void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
51 xfs_fsblock_t fsb, xfs_extlen_t len); 51 xfs_extlen_t len);
52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, 52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
53 xfs_agnumber_t agno); 53 xfs_agnumber_t agno);
54 54
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index e6aeb390b2fb..38e9414878b3 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -168,7 +168,6 @@ xfs_rmap_btrec_to_irec(
168 union xfs_btree_rec *rec, 168 union xfs_btree_rec *rec,
169 struct xfs_rmap_irec *irec) 169 struct xfs_rmap_irec *irec)
170{ 170{
171 irec->rm_flags = 0;
172 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); 171 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
173 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); 172 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
174 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); 173 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -254,15 +253,15 @@ xfs_rmap_find_left_neighbor_helper(
254 rec->rm_flags); 253 rec->rm_flags);
255 254
256 if (rec->rm_owner != info->high.rm_owner) 255 if (rec->rm_owner != info->high.rm_owner)
257 return XFS_BTREE_QUERY_RANGE_CONTINUE; 256 return 0;
258 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 257 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
259 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 258 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
260 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) 259 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
261 return XFS_BTREE_QUERY_RANGE_CONTINUE; 260 return 0;
262 261
263 *info->irec = *rec; 262 *info->irec = *rec;
264 *info->stat = 1; 263 *info->stat = 1;
265 return XFS_BTREE_QUERY_RANGE_ABORT; 264 return -ECANCELED;
266} 265}
267 266
268/* 267/*
@@ -305,7 +304,7 @@ xfs_rmap_find_left_neighbor(
305 304
306 error = xfs_rmap_query_range(cur, &info.high, &info.high, 305 error = xfs_rmap_query_range(cur, &info.high, &info.high,
307 xfs_rmap_find_left_neighbor_helper, &info); 306 xfs_rmap_find_left_neighbor_helper, &info);
308 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 307 if (error == -ECANCELED)
309 error = 0; 308 error = 0;
310 if (*stat) 309 if (*stat)
311 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, 310 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -330,16 +329,16 @@ xfs_rmap_lookup_le_range_helper(
330 rec->rm_flags); 329 rec->rm_flags);
331 330
332 if (rec->rm_owner != info->high.rm_owner) 331 if (rec->rm_owner != info->high.rm_owner)
333 return XFS_BTREE_QUERY_RANGE_CONTINUE; 332 return 0;
334 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 333 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
335 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 334 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
336 (rec->rm_offset > info->high.rm_offset || 335 (rec->rm_offset > info->high.rm_offset ||
337 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) 336 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
338 return XFS_BTREE_QUERY_RANGE_CONTINUE; 337 return 0;
339 338
340 *info->irec = *rec; 339 *info->irec = *rec;
341 *info->stat = 1; 340 *info->stat = 1;
342 return XFS_BTREE_QUERY_RANGE_ABORT; 341 return -ECANCELED;
343} 342}
344 343
345/* 344/*
@@ -377,7 +376,7 @@ xfs_rmap_lookup_le_range(
377 cur->bc_private.a.agno, bno, 0, owner, offset, flags); 376 cur->bc_private.a.agno, bno, 0, owner, offset, flags);
378 error = xfs_rmap_query_range(cur, &info.high, &info.high, 377 error = xfs_rmap_query_range(cur, &info.high, &info.high,
379 xfs_rmap_lookup_le_range_helper, &info); 378 xfs_rmap_lookup_le_range_helper, &info);
380 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 379 if (error == -ECANCELED)
381 error = 0; 380 error = 0;
382 if (*stat) 381 if (*stat)
383 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, 382 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -2268,7 +2267,7 @@ xfs_rmap_update_is_needed(
2268 * Record a rmap intent; the list is kept sorted first by AG and then by 2267 * Record a rmap intent; the list is kept sorted first by AG and then by
2269 * increasing age. 2268 * increasing age.
2270 */ 2269 */
2271static int 2270static void
2272__xfs_rmap_add( 2271__xfs_rmap_add(
2273 struct xfs_trans *tp, 2272 struct xfs_trans *tp,
2274 enum xfs_rmap_intent_type type, 2273 enum xfs_rmap_intent_type type,
@@ -2287,7 +2286,7 @@ __xfs_rmap_add(
2287 bmap->br_blockcount, 2286 bmap->br_blockcount,
2288 bmap->br_state); 2287 bmap->br_state);
2289 2288
2290 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); 2289 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
2291 INIT_LIST_HEAD(&ri->ri_list); 2290 INIT_LIST_HEAD(&ri->ri_list);
2292 ri->ri_type = type; 2291 ri->ri_type = type;
2293 ri->ri_owner = owner; 2292 ri->ri_owner = owner;
@@ -2295,11 +2294,10 @@ __xfs_rmap_add(
2295 ri->ri_bmap = *bmap; 2294 ri->ri_bmap = *bmap;
2296 2295
2297 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); 2296 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
2298 return 0;
2299} 2297}
2300 2298
2301/* Map an extent into a file. */ 2299/* Map an extent into a file. */
2302int 2300void
2303xfs_rmap_map_extent( 2301xfs_rmap_map_extent(
2304 struct xfs_trans *tp, 2302 struct xfs_trans *tp,
2305 struct xfs_inode *ip, 2303 struct xfs_inode *ip,
@@ -2307,15 +2305,15 @@ xfs_rmap_map_extent(
2307 struct xfs_bmbt_irec *PREV) 2305 struct xfs_bmbt_irec *PREV)
2308{ 2306{
2309 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2307 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2310 return 0; 2308 return;
2311 2309
2312 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2310 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2313 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, 2311 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
2314 whichfork, PREV); 2312 whichfork, PREV);
2315} 2313}
2316 2314
2317/* Unmap an extent out of a file. */ 2315/* Unmap an extent out of a file. */
2318int 2316void
2319xfs_rmap_unmap_extent( 2317xfs_rmap_unmap_extent(
2320 struct xfs_trans *tp, 2318 struct xfs_trans *tp,
2321 struct xfs_inode *ip, 2319 struct xfs_inode *ip,
@@ -2323,9 +2321,9 @@ xfs_rmap_unmap_extent(
2323 struct xfs_bmbt_irec *PREV) 2321 struct xfs_bmbt_irec *PREV)
2324{ 2322{
2325 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2323 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2326 return 0; 2324 return;
2327 2325
2328 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2326 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2329 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, 2327 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
2330 whichfork, PREV); 2328 whichfork, PREV);
2331} 2329}
@@ -2336,7 +2334,7 @@ xfs_rmap_unmap_extent(
2336 * Note that tp can be NULL here as no transaction is used for COW fork 2334 * Note that tp can be NULL here as no transaction is used for COW fork
2337 * unwritten conversion. 2335 * unwritten conversion.
2338 */ 2336 */
2339int 2337void
2340xfs_rmap_convert_extent( 2338xfs_rmap_convert_extent(
2341 struct xfs_mount *mp, 2339 struct xfs_mount *mp,
2342 struct xfs_trans *tp, 2340 struct xfs_trans *tp,
@@ -2345,15 +2343,15 @@ xfs_rmap_convert_extent(
2345 struct xfs_bmbt_irec *PREV) 2343 struct xfs_bmbt_irec *PREV)
2346{ 2344{
2347 if (!xfs_rmap_update_is_needed(mp, whichfork)) 2345 if (!xfs_rmap_update_is_needed(mp, whichfork))
2348 return 0; 2346 return;
2349 2347
2350 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2348 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2351 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, 2349 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
2352 whichfork, PREV); 2350 whichfork, PREV);
2353} 2351}
2354 2352
2355/* Schedule the creation of an rmap for non-file data. */ 2353/* Schedule the creation of an rmap for non-file data. */
2356int 2354void
2357xfs_rmap_alloc_extent( 2355xfs_rmap_alloc_extent(
2358 struct xfs_trans *tp, 2356 struct xfs_trans *tp,
2359 xfs_agnumber_t agno, 2357 xfs_agnumber_t agno,
@@ -2364,18 +2362,18 @@ xfs_rmap_alloc_extent(
2364 struct xfs_bmbt_irec bmap; 2362 struct xfs_bmbt_irec bmap;
2365 2363
2366 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2364 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2367 return 0; 2365 return;
2368 2366
2369 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2367 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2370 bmap.br_blockcount = len; 2368 bmap.br_blockcount = len;
2371 bmap.br_startoff = 0; 2369 bmap.br_startoff = 0;
2372 bmap.br_state = XFS_EXT_NORM; 2370 bmap.br_state = XFS_EXT_NORM;
2373 2371
2374 return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2372 __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
2375} 2373}
2376 2374
2377/* Schedule the deletion of an rmap for non-file data. */ 2375/* Schedule the deletion of an rmap for non-file data. */
2378int 2376void
2379xfs_rmap_free_extent( 2377xfs_rmap_free_extent(
2380 struct xfs_trans *tp, 2378 struct xfs_trans *tp,
2381 xfs_agnumber_t agno, 2379 xfs_agnumber_t agno,
@@ -2386,14 +2384,14 @@ xfs_rmap_free_extent(
2386 struct xfs_bmbt_irec bmap; 2384 struct xfs_bmbt_irec bmap;
2387 2385
2388 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2386 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2389 return 0; 2387 return;
2390 2388
2391 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2389 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2392 bmap.br_blockcount = len; 2390 bmap.br_blockcount = len;
2393 bmap.br_startoff = 0; 2391 bmap.br_startoff = 0;
2394 bmap.br_state = XFS_EXT_NORM; 2392 bmap.br_state = XFS_EXT_NORM;
2395 2393
2396 return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2394 __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
2397} 2395}
2398 2396
2399/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ 2397/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2511,7 +2509,7 @@ xfs_rmap_has_other_keys_helper(
2511 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) 2509 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
2512 return 0; 2510 return 0;
2513 rks->has_rmap = true; 2511 rks->has_rmap = true;
2514 return XFS_BTREE_QUERY_RANGE_ABORT; 2512 return -ECANCELED;
2515} 2513}
2516 2514
2517/* 2515/*
@@ -2540,8 +2538,11 @@ xfs_rmap_has_other_keys(
2540 2538
2541 error = xfs_rmap_query_range(cur, &low, &high, 2539 error = xfs_rmap_query_range(cur, &low, &high,
2542 xfs_rmap_has_other_keys_helper, &rks); 2540 xfs_rmap_has_other_keys_helper, &rks);
2541 if (error < 0)
2542 return error;
2543
2543 *has_rmap = rks.has_rmap; 2544 *has_rmap = rks.has_rmap;
2544 return error; 2545 return 0;
2545} 2546}
2546 2547
2547const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { 2548const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index e21ed0294e5c..abe633403fd1 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack(
68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) 68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
69 return -EFSCORRUPTED; 69 return -EFSCORRUPTED;
70 irec->rm_offset = XFS_RMAP_OFF(offset); 70 irec->rm_offset = XFS_RMAP_OFF(offset);
71 irec->rm_flags = 0;
71 if (offset & XFS_RMAP_OFF_ATTR_FORK) 72 if (offset & XFS_RMAP_OFF_ATTR_FORK)
72 irec->rm_flags |= XFS_RMAP_ATTR_FORK; 73 irec->rm_flags |= XFS_RMAP_ATTR_FORK;
73 if (offset & XFS_RMAP_OFF_BMBT_BLOCK) 74 if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -161,16 +162,16 @@ struct xfs_rmap_intent {
161}; 162};
162 163
163/* functions for updating the rmapbt based on bmbt map/unmap operations */ 164/* functions for updating the rmapbt based on bmbt map/unmap operations */
164int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 165void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
165 int whichfork, struct xfs_bmbt_irec *imap); 166 int whichfork, struct xfs_bmbt_irec *imap);
166int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 167void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
167 int whichfork, struct xfs_bmbt_irec *imap); 168 int whichfork, struct xfs_bmbt_irec *imap);
168int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 169void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
169 struct xfs_inode *ip, int whichfork, 170 struct xfs_inode *ip, int whichfork,
170 struct xfs_bmbt_irec *imap); 171 struct xfs_bmbt_irec *imap);
171int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 172void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
172 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 173 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
173int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 174void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
174 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 175 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
175 176
176void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, 177void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index e0641b7337b3..c45acbd3add9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -177,10 +177,4 @@ struct xfs_ino_geometry {
177 unsigned int agino_log; /* #bits for agino in inum */ 177 unsigned int agino_log; /* #bits for agino in inum */
178}; 178};
179 179
180/* Keep iterating the data structure. */
181#define XFS_ITER_CONTINUE (0)
182
183/* Stop iterating the data structure. */
184#define XFS_ITER_ABORT (1)
185
186#endif /* __XFS_SHARED_H__ */ 180#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 802b34cd10fe..300b3e91ca3a 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -169,6 +169,14 @@ typedef struct xfs_bmbt_irec
169 xfs_exntst_t br_state; /* extent state */ 169 xfs_exntst_t br_state; /* extent state */
170} xfs_bmbt_irec_t; 170} xfs_bmbt_irec_t;
171 171
172/* per-AG block reservation types */
173enum xfs_ag_resv_type {
174 XFS_AG_RESV_NONE = 0,
175 XFS_AG_RESV_AGFL,
176 XFS_AG_RESV_METADATA,
177 XFS_AG_RESV_RMAPBT,
178};
179
172/* 180/*
173 * Type verifier functions 181 * Type verifier functions
174 */ 182 */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 16b09b941441..ba0f747c82e8 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -639,7 +639,7 @@ xchk_agfl_block(
639 xchk_agfl_block_xref(sc, agbno); 639 xchk_agfl_block_xref(sc, agbno);
640 640
641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
642 return XFS_ITER_ABORT; 642 return -ECANCELED;
643 643
644 return 0; 644 return 0;
645} 645}
@@ -730,7 +730,7 @@ xchk_agfl(
730 /* Check the blocks in the AGFL. */ 730 /* Check the blocks in the AGFL. */
731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), 731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
732 sc->sa.agfl_bp, xchk_agfl_block, &sai); 732 sc->sa.agfl_bp, xchk_agfl_block, &sai);
733 if (error == XFS_ITER_ABORT) { 733 if (error == -ECANCELED) {
734 error = 0; 734 error = 0;
735 goto out_free; 735 goto out_free;
736 } 736 }
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 1afc58bf71dd..0edc7f8eb96e 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -80,7 +80,7 @@ xchk_setup_xattr(
80 * without the inode lock held, which means we can sleep. 80 * without the inode lock held, which means we can sleep.
81 */ 81 */
82 if (sc->flags & XCHK_TRY_HARDER) { 82 if (sc->flags & XCHK_TRY_HARDER) {
83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); 83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
84 if (error) 84 if (error)
85 return error; 85 return error;
86 } 86 }
@@ -163,8 +163,6 @@ xchk_xattr_listent(
163 args.valuelen = valuelen; 163 args.valuelen = valuelen;
164 164
165 error = xfs_attr_get_ilocked(context->dp, &args); 165 error = xfs_attr_get_ilocked(context->dp, &args);
166 if (error == -EEXIST)
167 error = 0;
168 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, 166 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
169 &error)) 167 &error))
170 goto fail_xref; 168 goto fail_xref;
@@ -173,7 +171,7 @@ xchk_xattr_listent(
173 args.blkno); 171 args.blkno);
174fail_xref: 172fail_xref:
175 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 173 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
176 context->seen_enough = XFS_ITER_ABORT; 174 context->seen_enough = 1;
177 return; 175 return;
178} 176}
179 177
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 1bd29fdc2ab5..fa6ea6407992 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -75,6 +75,7 @@ struct xchk_bmap_info {
75 xfs_fileoff_t lastoff; 75 xfs_fileoff_t lastoff;
76 bool is_rt; 76 bool is_rt;
77 bool is_shared; 77 bool is_shared;
78 bool was_loaded;
78 int whichfork; 79 int whichfork;
79}; 80};
80 81
@@ -213,25 +214,20 @@ xchk_bmap_xref_rmap(
213 214
214/* Cross-reference a single rtdev extent record. */ 215/* Cross-reference a single rtdev extent record. */
215STATIC void 216STATIC void
216xchk_bmap_rt_extent_xref( 217xchk_bmap_rt_iextent_xref(
217 struct xchk_bmap_info *info,
218 struct xfs_inode *ip, 218 struct xfs_inode *ip,
219 struct xfs_btree_cur *cur, 219 struct xchk_bmap_info *info,
220 struct xfs_bmbt_irec *irec) 220 struct xfs_bmbt_irec *irec)
221{ 221{
222 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
223 return;
224
225 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, 222 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
226 irec->br_blockcount); 223 irec->br_blockcount);
227} 224}
228 225
229/* Cross-reference a single datadev extent record. */ 226/* Cross-reference a single datadev extent record. */
230STATIC void 227STATIC void
231xchk_bmap_extent_xref( 228xchk_bmap_iextent_xref(
232 struct xchk_bmap_info *info,
233 struct xfs_inode *ip, 229 struct xfs_inode *ip,
234 struct xfs_btree_cur *cur, 230 struct xchk_bmap_info *info,
235 struct xfs_bmbt_irec *irec) 231 struct xfs_bmbt_irec *irec)
236{ 232{
237 struct xfs_mount *mp = info->sc->mp; 233 struct xfs_mount *mp = info->sc->mp;
@@ -240,9 +236,6 @@ xchk_bmap_extent_xref(
240 xfs_extlen_t len; 236 xfs_extlen_t len;
241 int error; 237 int error;
242 238
243 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
244 return;
245
246 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); 239 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
247 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); 240 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
248 len = irec->br_blockcount; 241 len = irec->br_blockcount;
@@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent(
300 293
301/* Scrub a single extent record. */ 294/* Scrub a single extent record. */
302STATIC int 295STATIC int
303xchk_bmap_extent( 296xchk_bmap_iextent(
304 struct xfs_inode *ip, 297 struct xfs_inode *ip,
305 struct xfs_btree_cur *cur,
306 struct xchk_bmap_info *info, 298 struct xchk_bmap_info *info,
307 struct xfs_bmbt_irec *irec) 299 struct xfs_bmbt_irec *irec)
308{ 300{
309 struct xfs_mount *mp = info->sc->mp; 301 struct xfs_mount *mp = info->sc->mp;
310 struct xfs_buf *bp = NULL;
311 xfs_filblks_t end; 302 xfs_filblks_t end;
312 int error = 0; 303 int error = 0;
313 304
314 if (cur)
315 xfs_btree_get_block(cur, 0, &bp);
316
317 /* 305 /*
318 * Check for out-of-order extents. This record could have come 306 * Check for out-of-order extents. This record could have come
319 * from the incore list, for which there is no ordering check. 307 * from the incore list, for which there is no ordering check.
@@ -364,10 +352,13 @@ xchk_bmap_extent(
364 xchk_fblock_set_corrupt(info->sc, info->whichfork, 352 xchk_fblock_set_corrupt(info->sc, info->whichfork,
365 irec->br_startoff); 353 irec->br_startoff);
366 354
355 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
356 return 0;
357
367 if (info->is_rt) 358 if (info->is_rt)
368 xchk_bmap_rt_extent_xref(info, ip, cur, irec); 359 xchk_bmap_rt_iextent_xref(ip, info, irec);
369 else 360 else
370 xchk_bmap_extent_xref(info, ip, cur, irec); 361 xchk_bmap_iextent_xref(ip, info, irec);
371 362
372 info->lastoff = irec->br_startoff + irec->br_blockcount; 363 info->lastoff = irec->br_startoff + irec->br_blockcount;
373 return error; 364 return error;
@@ -380,10 +371,13 @@ xchk_bmapbt_rec(
380 union xfs_btree_rec *rec) 371 union xfs_btree_rec *rec)
381{ 372{
382 struct xfs_bmbt_irec irec; 373 struct xfs_bmbt_irec irec;
374 struct xfs_bmbt_irec iext_irec;
375 struct xfs_iext_cursor icur;
383 struct xchk_bmap_info *info = bs->private; 376 struct xchk_bmap_info *info = bs->private;
384 struct xfs_inode *ip = bs->cur->bc_private.b.ip; 377 struct xfs_inode *ip = bs->cur->bc_private.b.ip;
385 struct xfs_buf *bp = NULL; 378 struct xfs_buf *bp = NULL;
386 struct xfs_btree_block *block; 379 struct xfs_btree_block *block;
380 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
387 uint64_t owner; 381 uint64_t owner;
388 int i; 382 int i;
389 383
@@ -402,9 +396,26 @@ xchk_bmapbt_rec(
402 } 396 }
403 } 397 }
404 398
405 /* Set up the in-core record and scrub it. */ 399 /*
400 * Check that the incore extent tree contains an extent that matches
401 * this one exactly. We validate those cached bmaps later, so we don't
402 * need to check them here. If the incore extent tree was just loaded
403 * from disk by the scrubber, we assume that its contents match what's
404 * on disk (we still hold the ILOCK) and skip the equivalence check.
405 */
406 if (!info->was_loaded)
407 return 0;
408
406 xfs_bmbt_disk_get_all(&rec->bmbt, &irec); 409 xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
407 return xchk_bmap_extent(ip, bs->cur, info, &irec); 410 if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
411 &iext_irec) ||
412 irec.br_startoff != iext_irec.br_startoff ||
413 irec.br_startblock != iext_irec.br_startblock ||
414 irec.br_blockcount != iext_irec.br_blockcount ||
415 irec.br_state != iext_irec.br_state)
416 xchk_fblock_set_corrupt(bs->sc, info->whichfork,
417 irec.br_startoff);
418 return 0;
408} 419}
409 420
410/* Scan the btree records. */ 421/* Scan the btree records. */
@@ -415,15 +426,26 @@ xchk_bmap_btree(
415 struct xchk_bmap_info *info) 426 struct xchk_bmap_info *info)
416{ 427{
417 struct xfs_owner_info oinfo; 428 struct xfs_owner_info oinfo;
429 struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
418 struct xfs_mount *mp = sc->mp; 430 struct xfs_mount *mp = sc->mp;
419 struct xfs_inode *ip = sc->ip; 431 struct xfs_inode *ip = sc->ip;
420 struct xfs_btree_cur *cur; 432 struct xfs_btree_cur *cur;
421 int error; 433 int error;
422 434
435 /* Load the incore bmap cache if it's not loaded. */
436 info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
437 if (!info->was_loaded) {
438 error = xfs_iread_extents(sc->tp, ip, whichfork);
439 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
440 goto out;
441 }
442
443 /* Check the btree structure. */
423 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); 444 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
424 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); 445 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
425 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); 446 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
426 xfs_btree_del_cursor(cur, error); 447 xfs_btree_del_cursor(cur, error);
448out:
427 return error; 449 return error;
428} 450}
429 451
@@ -500,7 +522,7 @@ xchk_bmap_check_rmap(
500 522
501out: 523out:
502 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 524 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
503 return XFS_BTREE_QUERY_RANGE_ABORT; 525 return -ECANCELED;
504 return 0; 526 return 0;
505} 527}
506 528
@@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps(
529 sbcri.sc = sc; 551 sbcri.sc = sc;
530 sbcri.whichfork = whichfork; 552 sbcri.whichfork = whichfork;
531 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); 553 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
532 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 554 if (error == -ECANCELED)
533 error = 0; 555 error = 0;
534 556
535 xfs_btree_del_cursor(cur, error); 557 xfs_btree_del_cursor(cur, error);
@@ -671,13 +693,6 @@ xchk_bmap(
671 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 693 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
672 goto out; 694 goto out;
673 695
674 /* Now try to scrub the in-memory extent list. */
675 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
676 error = xfs_iread_extents(sc->tp, ip, whichfork);
677 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
678 goto out;
679 }
680
681 /* Find the offset of the last extent in the mapping. */ 696 /* Find the offset of the last extent in the mapping. */
682 error = xfs_bmap_last_offset(ip, &endoff, whichfork); 697 error = xfs_bmap_last_offset(ip, &endoff, whichfork);
683 if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 698 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -689,7 +704,7 @@ xchk_bmap(
689 for_each_xfs_iext(ifp, &icur, &irec) { 704 for_each_xfs_iext(ifp, &icur, &irec) {
690 if (xchk_should_terminate(sc, &error) || 705 if (xchk_should_terminate(sc, &error) ||
691 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 706 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
692 break; 707 goto out;
693 if (isnullstartblock(irec.br_startblock)) 708 if (isnullstartblock(irec.br_startblock))
694 continue; 709 continue;
695 if (irec.br_startoff >= endoff) { 710 if (irec.br_startoff >= endoff) {
@@ -697,7 +712,7 @@ xchk_bmap(
697 irec.br_startoff); 712 irec.br_startoff);
698 goto out; 713 goto out;
699 } 714 }
700 error = xchk_bmap_extent(ip, NULL, &info, &irec); 715 error = xchk_bmap_iextent(ip, &info, &irec);
701 if (error) 716 if (error)
702 goto out; 717 goto out;
703 } 718 }
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index fc3f510c9034..98f82d7c8b40 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -125,7 +125,7 @@ xchk_setup_fscounters(
125 struct xchk_fscounters *fsc; 125 struct xchk_fscounters *fsc;
126 int error; 126 int error;
127 127
128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); 128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
129 if (!sc->buf) 129 if (!sc->buf)
130 return -ENOMEM; 130 return -ENOMEM;
131 fsc = sc->buf; 131 fsc = sc->buf;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4cfeec57fb05..b70a88bc975e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -351,7 +351,7 @@ xrep_init_btblock(
351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); 352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); 353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
354 xfs_trans_log_buf(tp, bp, 0, bp->b_length); 354 xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
355 bp->b_ops = ops; 355 bp->b_ops = ops;
356 *bpp = bp; 356 *bpp = bp;
357 357
@@ -664,7 +664,7 @@ xrep_findroot_agfl_walk(
664{ 664{
665 xfs_agblock_t *agbno = priv; 665 xfs_agblock_t *agbno = priv;
666 666
667 return (*agbno == bno) ? XFS_ITER_ABORT : 0; 667 return (*agbno == bno) ? -ECANCELED : 0;
668} 668}
669 669
670/* Does this block match the btree information passed in? */ 670/* Does this block match the btree information passed in? */
@@ -694,7 +694,7 @@ xrep_findroot_block(
694 if (owner == XFS_RMAP_OWN_AG) { 694 if (owner == XFS_RMAP_OWN_AG) {
695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, 695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
696 xrep_findroot_agfl_walk, &agbno); 696 xrep_findroot_agfl_walk, &agbno);
697 if (error == XFS_ITER_ABORT) 697 if (error == -ECANCELED)
698 return 0; 698 return 0;
699 if (error) 699 if (error)
700 return error; 700 return error;
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 99c0b1234c3c..5641ae512c9e 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@ xchk_setup_symlink(
22 struct xfs_inode *ip) 22 struct xfs_inode *ip)
23{ 23{
24 /* Allocate the buffer without the inode lock held. */ 24 /* Allocate the buffer without the inode lock held. */
25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); 25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
26 if (!sc->buf) 26 if (!sc->buf)
27 return -ENOMEM; 27 return -ENOMEM;
28 28
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cbda40d40326..96d7071cfa46 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -112,7 +112,7 @@ xfs_get_acl(struct inode *inode, int type)
112{ 112{
113 struct xfs_inode *ip = XFS_I(inode); 113 struct xfs_inode *ip = XFS_I(inode);
114 struct posix_acl *acl = NULL; 114 struct posix_acl *acl = NULL;
115 struct xfs_acl *xfs_acl; 115 struct xfs_acl *xfs_acl = NULL;
116 unsigned char *ea_name; 116 unsigned char *ea_name;
117 int error; 117 int error;
118 int len; 118 int len;
@@ -135,12 +135,8 @@ xfs_get_acl(struct inode *inode, int type)
135 * go out to the disk. 135 * go out to the disk.
136 */ 136 */
137 len = XFS_ACL_MAX_SIZE(ip->i_mount); 137 len = XFS_ACL_MAX_SIZE(ip->i_mount);
138 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 138 error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
139 if (!xfs_acl) 139 ATTR_ALLOC | ATTR_ROOT);
140 return ERR_PTR(-ENOMEM);
141
142 error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
143 &len, ATTR_ROOT);
144 if (error) { 140 if (error) {
145 /* 141 /*
146 * If the attribute doesn't exist make sure we have a negative 142 * If the attribute doesn't exist make sure we have a negative
@@ -151,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type)
151 } else { 147 } else {
152 acl = xfs_acl_from_disk(xfs_acl, len, 148 acl = xfs_acl_from_disk(xfs_acl, len,
153 XFS_ACL_MAX_ENTRIES(ip->i_mount)); 149 XFS_ACL_MAX_ENTRIES(ip->i_mount));
150 kmem_free(xfs_acl);
154 } 151 }
155 kmem_free(xfs_acl);
156 return acl; 152 return acl;
157} 153}
158 154
@@ -180,7 +176,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
180 struct xfs_acl *xfs_acl; 176 struct xfs_acl *xfs_acl;
181 int len = XFS_ACL_MAX_SIZE(ip->i_mount); 177 int len = XFS_ACL_MAX_SIZE(ip->i_mount);
182 178
183 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 179 xfs_acl = kmem_zalloc_large(len, 0);
184 if (!xfs_acl) 180 if (!xfs_acl)
185 return -ENOMEM; 181 return -ENOMEM;
186 182
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index dc93c51c17de..a640a285cc52 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -147,7 +147,7 @@ xfs_attr3_leaf_inactive(
147 * Allocate storage for a list of all the "remote" value extents. 147 * Allocate storage for a list of all the "remote" value extents.
148 */ 148 */
149 size = count * sizeof(xfs_attr_inactive_list_t); 149 size = count * sizeof(xfs_attr_inactive_list_t);
150 list = kmem_alloc(size, KM_SLEEP); 150 list = kmem_alloc(size, 0);
151 151
152 /* 152 /*
153 * Identify each of the "remote" value extents. 153 * Identify each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 58fc820a70c6..00758fdc2fec 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -109,7 +109,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
109 * It didn't all fit, so we have to sort everything on hashval. 109 * It didn't all fit, so we have to sort everything on hashval.
110 */ 110 */
111 sbsize = sf->hdr.count * sizeof(*sbuf); 111 sbsize = sf->hdr.count * sizeof(*sbuf);
112 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); 112 sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
113 113
114 /* 114 /*
115 * Scan the attribute list for the rest of the entries, storing 115 * Scan the attribute list for the rest of the entries, storing
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9fa4a7ee8cfc..83d24e983d4c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -141,7 +141,7 @@ xfs_bui_init(
141{ 141{
142 struct xfs_bui_log_item *buip; 142 struct xfs_bui_log_item *buip;
143 143
144 buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); 144 buip = kmem_zone_zalloc(xfs_bui_zone, 0);
145 145
146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); 146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; 147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -218,7 +218,7 @@ xfs_trans_get_bud(
218{ 218{
219 struct xfs_bud_log_item *budp; 219 struct xfs_bud_log_item *budp;
220 220
221 budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); 221 budp = kmem_zone_zalloc(xfs_bud_zone, 0);
222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, 222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
223 &xfs_bud_item_ops); 223 &xfs_bud_item_ops);
224 budp->bud_buip = buip; 224 budp->bud_buip = buip;
@@ -542,9 +542,7 @@ xfs_bui_recover(
542 irec.br_blockcount = count; 542 irec.br_blockcount = count;
543 irec.br_startoff = bmap->me_startoff; 543 irec.br_startoff = bmap->me_startoff;
544 irec.br_state = state; 544 irec.br_state = state;
545 error = xfs_bmap_unmap_extent(tp, ip, &irec); 545 xfs_bmap_unmap_extent(tp, ip, &irec);
546 if (error)
547 goto err_inode;
548 } 546 }
549 547
550 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); 548 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 98c6a7a71427..0910cb75b65d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -39,9 +39,9 @@
39xfs_daddr_t 39xfs_daddr_t
40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
41{ 41{
42 return (XFS_IS_REALTIME_INODE(ip) ? \ 42 if (XFS_IS_REALTIME_INODE(ip))
43 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ 43 return XFS_FSB_TO_BB(ip->i_mount, fsb);
44 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); 44 return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
45} 45}
46 46
47/* 47/*
@@ -1532,24 +1532,16 @@ xfs_swap_extent_rmap(
1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); 1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
1533 1533
1534 /* Remove the mapping from the donor file. */ 1534 /* Remove the mapping from the donor file. */
1535 error = xfs_bmap_unmap_extent(tp, tip, &uirec); 1535 xfs_bmap_unmap_extent(tp, tip, &uirec);
1536 if (error)
1537 goto out;
1538 1536
1539 /* Remove the mapping from the source file. */ 1537 /* Remove the mapping from the source file. */
1540 error = xfs_bmap_unmap_extent(tp, ip, &irec); 1538 xfs_bmap_unmap_extent(tp, ip, &irec);
1541 if (error)
1542 goto out;
1543 1539
1544 /* Map the donor file's blocks into the source file. */ 1540 /* Map the donor file's blocks into the source file. */
1545 error = xfs_bmap_map_extent(tp, ip, &uirec); 1541 xfs_bmap_map_extent(tp, ip, &uirec);
1546 if (error)
1547 goto out;
1548 1542
1549 /* Map the source file's blocks into the donor file. */ 1543 /* Map the source file's blocks into the donor file. */
1550 error = xfs_bmap_map_extent(tp, tip, &irec); 1544 xfs_bmap_map_extent(tp, tip, &irec);
1551 if (error)
1552 goto out;
1553 1545
1554 error = xfs_defer_finish(tpp); 1546 error = xfs_defer_finish(tpp);
1555 tp = *tpp; 1547 tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ca0849043f54..120ef99d09e8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -353,7 +353,8 @@ xfs_buf_allocate_memory(
353 */ 353 */
354 size = BBTOB(bp->b_length); 354 size = BBTOB(bp->b_length);
355 if (size < PAGE_SIZE) { 355 if (size < PAGE_SIZE) {
356 bp->b_addr = kmem_alloc(size, KM_NOFS); 356 int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
357 bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
357 if (!bp->b_addr) { 358 if (!bp->b_addr) {
358 /* low memory - use alloc_page loop instead */ 359 /* low memory - use alloc_page loop instead */
359 goto use_alloc_page; 360 goto use_alloc_page;
@@ -368,7 +369,7 @@ xfs_buf_allocate_memory(
368 } 369 }
369 bp->b_offset = offset_in_page(bp->b_addr); 370 bp->b_offset = offset_in_page(bp->b_addr);
370 bp->b_pages = bp->b_page_array; 371 bp->b_pages = bp->b_page_array;
371 bp->b_pages[0] = virt_to_page(bp->b_addr); 372 bp->b_pages[0] = kmem_to_page(bp->b_addr);
372 bp->b_page_count = 1; 373 bp->b_page_count = 1;
373 bp->b_flags |= _XBF_KMEM; 374 bp->b_flags |= _XBF_KMEM;
374 return 0; 375 return 0;
@@ -1741,7 +1742,7 @@ xfs_alloc_buftarg(
1741{ 1742{
1742 xfs_buftarg_t *btp; 1743 xfs_buftarg_t *btp;
1743 1744
1744 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1745 btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
1745 1746
1746 btp->bt_mount = mp; 1747 btp->bt_mount = mp;
1747 btp->bt_dev = bdev->bd_dev; 1748 btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c6e57a3f409e..f6ce17d8d848 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -350,6 +350,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
352 352
353static inline int
354xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
355{
356 return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
357}
358
353int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); 359int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
354bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); 360bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
355bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); 361bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7dcaec54a20b..d74fbd1e9d3e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -702,7 +702,7 @@ xfs_buf_item_get_format(
702 } 702 }
703 703
704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
705 KM_SLEEP); 705 0);
706 if (!bip->bli_formats) 706 if (!bip->bli_formats)
707 return -ENOMEM; 707 return -ENOMEM;
708 return 0; 708 return 0;
@@ -747,7 +747,7 @@ xfs_buf_item_init(
747 return 0; 747 return 0;
748 } 748 }
749 749
750 bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); 750 bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); 751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
752 bip->bli_buf = bp; 752 bip->bli_buf = bp;
753 753
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fb1ad4483081..aeb95e7391c1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -440,7 +440,7 @@ xfs_dquot_alloc(
440{ 440{
441 struct xfs_dquot *dqp; 441 struct xfs_dquot *dqp;
442 442
443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
444 444
445 dqp->dq_flags = type; 445 dqp->dq_flags = type;
446 dqp->q_core.d_id = cpu_to_be32(id); 446 dqp->q_core.d_id = cpu_to_be32(id);
@@ -1239,7 +1239,7 @@ xfs_qm_exit(void)
1239/* 1239/*
1240 * Iterate every dquot of a particular type. The caller must ensure that the 1240 * Iterate every dquot of a particular type. The caller must ensure that the
1241 * particular quota type is active. iter_fn can return negative error codes, 1241 * particular quota type is active. iter_fn can return negative error codes,
1242 * or XFS_ITER_ABORT to indicate that it wants to stop iterating. 1242 * or -ECANCELED to indicate that it wants to stop iterating.
1243 */ 1243 */
1244int 1244int
1245xfs_qm_dqiterate( 1245xfs_qm_dqiterate(
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 282ec5af293e..d60647d7197b 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init(
347{ 347{
348 struct xfs_qoff_logitem *qf; 348 struct xfs_qoff_logitem *qf;
349 349
350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); 350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
351 351
352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); 353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 544c9482a0ef..849fd4476950 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -213,7 +213,7 @@ xfs_errortag_init(
213 struct xfs_mount *mp) 213 struct xfs_mount *mp)
214{ 214{
215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, 215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
216 KM_SLEEP | KM_MAYFAIL); 216 KM_MAYFAIL);
217 if (!mp->m_errortag) 217 if (!mp->m_errortag)
218 return -ENOMEM; 218 return -ENOMEM;
219 219
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed68379e551..2183d87be4cf 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -33,7 +33,7 @@ xfs_extent_busy_insert(
33 struct rb_node **rbp; 33 struct rb_node **rbp;
34 struct rb_node *parent = NULL; 34 struct rb_node *parent = NULL;
35 35
36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); 36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
37 new->agno = agno; 37 new->agno = agno;
38 new->bno = bno; 38 new->bno = bno;
39 new->length = len; 39 new->length = len;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 86f6512d6864..e44efc41a041 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -163,9 +163,9 @@ xfs_efi_init(
163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { 163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
164 size = (uint)(sizeof(xfs_efi_log_item_t) + 164 size = (uint)(sizeof(xfs_efi_log_item_t) +
165 ((nextents - 1) * sizeof(xfs_extent_t))); 165 ((nextents - 1) * sizeof(xfs_extent_t)));
166 efip = kmem_zalloc(size, KM_SLEEP); 166 efip = kmem_zalloc(size, 0);
167 } else { 167 } else {
168 efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); 168 efip = kmem_zone_zalloc(xfs_efi_zone, 0);
169 } 169 }
170 170
171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -333,9 +333,9 @@ xfs_trans_get_efd(
333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { 333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + 334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
335 (nextents - 1) * sizeof(struct xfs_extent), 335 (nextents - 1) * sizeof(struct xfs_extent),
336 KM_SLEEP); 336 0);
337 } else { 337 } else {
338 efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); 338 efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
339 } 339 }
340 340
341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, 341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 28101bbc0b78..d952d5962e93 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,6 +28,7 @@
28#include <linux/falloc.h> 28#include <linux/falloc.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/mman.h> 30#include <linux/mman.h>
31#include <linux/fadvise.h>
31 32
32static const struct vm_operations_struct xfs_file_vm_ops; 33static const struct vm_operations_struct xfs_file_vm_ops;
33 34
@@ -933,6 +934,30 @@ out_unlock:
933 return error; 934 return error;
934} 935}
935 936
937STATIC int
938xfs_file_fadvise(
939 struct file *file,
940 loff_t start,
941 loff_t end,
942 int advice)
943{
944 struct xfs_inode *ip = XFS_I(file_inode(file));
945 int ret;
946 int lockflags = 0;
947
948 /*
949 * Operations creating pages in page cache need protection from hole
950 * punching and similar ops
951 */
952 if (advice == POSIX_FADV_WILLNEED) {
953 lockflags = XFS_IOLOCK_SHARED;
954 xfs_ilock(ip, lockflags);
955 }
956 ret = generic_fadvise(file, start, end, advice);
957 if (lockflags)
958 xfs_iunlock(ip, lockflags);
959 return ret;
960}
936 961
937STATIC loff_t 962STATIC loff_t
938xfs_file_remap_range( 963xfs_file_remap_range(
@@ -1232,6 +1257,7 @@ const struct file_operations xfs_file_operations = {
1232 .fsync = xfs_file_fsync, 1257 .fsync = xfs_file_fsync,
1233 .get_unmapped_area = thp_get_unmapped_area, 1258 .get_unmapped_area = thp_get_unmapped_area,
1234 .fallocate = xfs_file_fallocate, 1259 .fallocate = xfs_file_fallocate,
1260 .fadvise = xfs_file_fadvise,
1235 .remap_file_range = xfs_file_remap_range, 1261 .remap_file_range = xfs_file_remap_range,
1236}; 1262};
1237 1263
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a8f9641562a..d082143feb5a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -250,7 +250,7 @@ xfs_getfsmap_helper(
250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
251 if (info->next_daddr < rec_daddr) 251 if (info->next_daddr < rec_daddr)
252 info->next_daddr = rec_daddr; 252 info->next_daddr = rec_daddr;
253 return XFS_BTREE_QUERY_RANGE_CONTINUE; 253 return 0;
254 } 254 }
255 255
256 /* Are we just counting mappings? */ 256 /* Are we just counting mappings? */
@@ -259,14 +259,14 @@ xfs_getfsmap_helper(
259 info->head->fmh_entries++; 259 info->head->fmh_entries++;
260 260
261 if (info->last) 261 if (info->last)
262 return XFS_BTREE_QUERY_RANGE_CONTINUE; 262 return 0;
263 263
264 info->head->fmh_entries++; 264 info->head->fmh_entries++;
265 265
266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
267 if (info->next_daddr < rec_daddr) 267 if (info->next_daddr < rec_daddr)
268 info->next_daddr = rec_daddr; 268 info->next_daddr = rec_daddr;
269 return XFS_BTREE_QUERY_RANGE_CONTINUE; 269 return 0;
270 } 270 }
271 271
272 /* 272 /*
@@ -276,7 +276,7 @@ xfs_getfsmap_helper(
276 */ 276 */
277 if (rec_daddr > info->next_daddr) { 277 if (rec_daddr > info->next_daddr) {
278 if (info->head->fmh_entries >= info->head->fmh_count) 278 if (info->head->fmh_entries >= info->head->fmh_count)
279 return XFS_BTREE_QUERY_RANGE_ABORT; 279 return -ECANCELED;
280 280
281 fmr.fmr_device = info->dev; 281 fmr.fmr_device = info->dev;
282 fmr.fmr_physical = info->next_daddr; 282 fmr.fmr_physical = info->next_daddr;
@@ -295,7 +295,7 @@ xfs_getfsmap_helper(
295 295
296 /* Fill out the extent we found */ 296 /* Fill out the extent we found */
297 if (info->head->fmh_entries >= info->head->fmh_count) 297 if (info->head->fmh_entries >= info->head->fmh_count)
298 return XFS_BTREE_QUERY_RANGE_ABORT; 298 return -ECANCELED;
299 299
300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); 300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
301 301
@@ -328,7 +328,7 @@ out:
328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
329 if (info->next_daddr < rec_daddr) 329 if (info->next_daddr < rec_daddr)
330 info->next_daddr = rec_daddr; 330 info->next_daddr = rec_daddr;
331 return XFS_BTREE_QUERY_RANGE_CONTINUE; 331 return 0;
332} 332}
333 333
334/* Transform a rmapbt irec into a fsmap */ 334/* Transform a rmapbt irec into a fsmap */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0b0fd10a36d4..944add5ff8e0 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -40,7 +40,7 @@ xfs_inode_alloc(
40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the 40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
41 * code up to do this anyway. 41 * code up to do this anyway.
42 */ 42 */
43 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 43 ip = kmem_zone_alloc(xfs_inode_zone, 0);
44 if (!ip) 44 if (!ip)
45 return NULL; 45 return NULL;
46 if (inode_init_always(mp->m_super, VFS_I(ip))) { 46 if (inode_init_always(mp->m_super, VFS_I(ip))) {
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d99a0a3e5f40..3ebd1b7f49d8 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -89,7 +89,7 @@ xfs_icreate_log(
89{ 89{
90 struct xfs_icreate_item *icp; 90 struct xfs_icreate_item *icp;
91 91
92 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); 92 icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
93 93
94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, 94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
95 &xfs_icreate_item_ops); 95 &xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6467d5e1df2d..18f4b262e61c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2018,7 +2018,7 @@ xfs_iunlink_add_backref(
2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) 2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2019 return 0; 2019 return 0;
2020 2020
2021 iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); 2021 iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
2022 iu->iu_agino = prev_agino; 2022 iu->iu_agino = prev_agino;
2023 iu->iu_next_unlinked = this_agino; 2023 iu->iu_next_unlinked = this_agino;
2024 2024
@@ -3282,7 +3282,8 @@ xfs_rename(
3282 spaceres); 3282 spaceres);
3283 3283
3284 /* 3284 /*
3285 * Set up the target. 3285 * Check for expected errors before we dirty the transaction
3286 * so we can return an error without a transaction abort.
3286 */ 3287 */
3287 if (target_ip == NULL) { 3288 if (target_ip == NULL) {
3288 /* 3289 /*
@@ -3294,6 +3295,46 @@ xfs_rename(
3294 if (error) 3295 if (error)
3295 goto out_trans_cancel; 3296 goto out_trans_cancel;
3296 } 3297 }
3298 } else {
3299 /*
3300 * If target exists and it's a directory, check that whether
3301 * it can be destroyed.
3302 */
3303 if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
3304 (!xfs_dir_isempty(target_ip) ||
3305 (VFS_I(target_ip)->i_nlink > 2))) {
3306 error = -EEXIST;
3307 goto out_trans_cancel;
3308 }
3309 }
3310
3311 /*
3312 * Directory entry creation below may acquire the AGF. Remove
3313 * the whiteout from the unlinked list first to preserve correct
3314 * AGI/AGF locking order. This dirties the transaction so failures
3315 * after this point will abort and log recovery will clean up the
3316 * mess.
3317 *
3318 * For whiteouts, we need to bump the link count on the whiteout
3319 * inode. After this point, we have a real link, clear the tmpfile
3320 * state flag from the inode so it doesn't accidentally get misused
3321 * in future.
3322 */
3323 if (wip) {
3324 ASSERT(VFS_I(wip)->i_nlink == 0);
3325 error = xfs_iunlink_remove(tp, wip);
3326 if (error)
3327 goto out_trans_cancel;
3328
3329 xfs_bumplink(tp, wip);
3330 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3331 VFS_I(wip)->i_state &= ~I_LINKABLE;
3332 }
3333
3334 /*
3335 * Set up the target.
3336 */
3337 if (target_ip == NULL) {
3297 /* 3338 /*
3298 * If target does not exist and the rename crosses 3339 * If target does not exist and the rename crosses
3299 * directories, adjust the target directory link count 3340 * directories, adjust the target directory link count
@@ -3312,22 +3353,6 @@ xfs_rename(
3312 } 3353 }
3313 } else { /* target_ip != NULL */ 3354 } else { /* target_ip != NULL */
3314 /* 3355 /*
3315 * If target exists and it's a directory, check that both
3316 * target and source are directories and that target can be
3317 * destroyed, or that neither is a directory.
3318 */
3319 if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3320 /*
3321 * Make sure target dir is empty.
3322 */
3323 if (!(xfs_dir_isempty(target_ip)) ||
3324 (VFS_I(target_ip)->i_nlink > 2)) {
3325 error = -EEXIST;
3326 goto out_trans_cancel;
3327 }
3328 }
3329
3330 /*
3331 * Link the source inode under the target name. 3356 * Link the source inode under the target name.
3332 * If the source inode is a directory and we are moving 3357 * If the source inode is a directory and we are moving
3333 * it across directories, its ".." entry will be 3358 * it across directories, its ".." entry will be
@@ -3417,30 +3442,6 @@ xfs_rename(
3417 if (error) 3442 if (error)
3418 goto out_trans_cancel; 3443 goto out_trans_cancel;
3419 3444
3420 /*
3421 * For whiteouts, we need to bump the link count on the whiteout inode.
3422 * This means that failures all the way up to this point leave the inode
3423 * on the unlinked list and so cleanup is a simple matter of dropping
3424 * the remaining reference to it. If we fail here after bumping the link
3425 * count, we're shutting down the filesystem so we'll never see the
3426 * intermediate state on disk.
3427 */
3428 if (wip) {
3429 ASSERT(VFS_I(wip)->i_nlink == 0);
3430 xfs_bumplink(tp, wip);
3431 error = xfs_iunlink_remove(tp, wip);
3432 if (error)
3433 goto out_trans_cancel;
3434 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3435
3436 /*
3437 * Now we have a real link, clear the "I'm a tmpfile" state
3438 * flag from the inode so it doesn't accidentally get misused in
3439 * future.
3440 */
3441 VFS_I(wip)->i_state &= ~I_LINKABLE;
3442 }
3443
3444 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3445 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3445 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3446 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3446 if (new_parent) 3447 if (new_parent)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c9a502eed204..bb8f076805b9 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,7 +651,7 @@ xfs_inode_item_init(
651 struct xfs_inode_log_item *iip; 651 struct xfs_inode_log_item *iip;
652 652
653 ASSERT(ip->i_itemp == NULL); 653 ASSERT(ip->i_itemp == NULL);
654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
655 655
656 iip->ili_inode = ip; 656 iip->ili_inode = ip;
657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index affa557c2337..d58f0d6a699e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -396,7 +396,7 @@ xfs_attrlist_by_handle(
396 if (IS_ERR(dentry)) 396 if (IS_ERR(dentry))
397 return PTR_ERR(dentry); 397 return PTR_ERR(dentry);
398 398
399 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 399 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
400 if (!kbuf) 400 if (!kbuf)
401 goto out_dput; 401 goto out_dput;
402 402
@@ -434,11 +434,11 @@ xfs_attrmulti_attr_get(
434 434
435 if (*len > XFS_XATTR_SIZE_MAX) 435 if (*len > XFS_XATTR_SIZE_MAX)
436 return -EINVAL; 436 return -EINVAL;
437 kbuf = kmem_zalloc_large(*len, KM_SLEEP); 437 kbuf = kmem_zalloc_large(*len, 0);
438 if (!kbuf) 438 if (!kbuf)
439 return -ENOMEM; 439 return -ENOMEM;
440 440
441 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 441 error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
442 if (error) 442 if (error)
443 goto out_kfree; 443 goto out_kfree;
444 444
@@ -831,7 +831,7 @@ xfs_bulkstat_fmt(
831/* 831/*
832 * Check the incoming bulk request @hdr from userspace and initialize the 832 * Check the incoming bulk request @hdr from userspace and initialize the
833 * internal @breq bulk request appropriately. Returns 0 if the bulk request 833 * internal @breq bulk request appropriately. Returns 0 if the bulk request
834 * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual 834 * should proceed; -ECANCELED if there's nothing to do; or the usual
835 * negative error code. 835 * negative error code.
836 */ 836 */
837static int 837static int
@@ -889,13 +889,13 @@ xfs_bulk_ireq_setup(
889 889
890 /* Asking for an inode past the end of the AG? We're done! */ 890 /* Asking for an inode past the end of the AG? We're done! */
891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) 891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
892 return XFS_ITER_ABORT; 892 return -ECANCELED;
893 } else if (hdr->agno) 893 } else if (hdr->agno)
894 return -EINVAL; 894 return -EINVAL;
895 895
896 /* Asking for an inode past the end of the FS? We're done! */ 896 /* Asking for an inode past the end of the FS? We're done! */
897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) 897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
898 return XFS_ITER_ABORT; 898 return -ECANCELED;
899 899
900 return 0; 900 return 0;
901} 901}
@@ -936,7 +936,7 @@ xfs_ioc_bulkstat(
936 return -EFAULT; 936 return -EFAULT;
937 937
938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); 938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
939 if (error == XFS_ITER_ABORT) 939 if (error == -ECANCELED)
940 goto out_teardown; 940 goto out_teardown;
941 if (error < 0) 941 if (error < 0)
942 return error; 942 return error;
@@ -986,7 +986,7 @@ xfs_ioc_inumbers(
986 return -EFAULT; 986 return -EFAULT;
987 987
988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); 988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
989 if (error == XFS_ITER_ABORT) 989 if (error == -ECANCELED)
990 goto out_teardown; 990 goto out_teardown;
991 if (error < 0) 991 if (error < 0)
992 return error; 992 return error;
@@ -1038,6 +1038,10 @@ xfs_ioc_ag_geometry(
1038 1038
1039 if (copy_from_user(&ageo, arg, sizeof(ageo))) 1039 if (copy_from_user(&ageo, arg, sizeof(ageo)))
1040 return -EFAULT; 1040 return -EFAULT;
1041 if (ageo.ag_flags)
1042 return -EINVAL;
1043 if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
1044 return -EINVAL;
1041 1045
1042 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); 1046 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
1043 if (error) 1047 if (error)
@@ -1309,8 +1313,7 @@ xfs_ioctl_setattr_dax_invalidate(
1309 if (fa->fsx_xflags & FS_XFLAG_DAX) { 1313 if (fa->fsx_xflags & FS_XFLAG_DAX) {
1310 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 1314 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
1311 return -EINVAL; 1315 return -EINVAL;
1312 if (S_ISREG(inode->i_mode) && 1316 if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1313 !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1314 sb->s_blocksize)) 1317 sb->s_blocksize))
1315 return -EINVAL; 1318 return -EINVAL;
1316 } 1319 }
@@ -1881,7 +1884,7 @@ xfs_ioc_getfsmap(
1881 info.mp = ip->i_mount; 1884 info.mp = ip->i_mount;
1882 info.data = arg; 1885 info.data = arg;
1883 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); 1886 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
1884 if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 1887 if (error == -ECANCELED) {
1885 error = 0; 1888 error = 0;
1886 aborted = true; 1889 aborted = true;
1887 } else if (error) 1890 } else if (error)
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7bd7534f5051..1e08bf79b478 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle(
381 return PTR_ERR(dentry); 381 return PTR_ERR(dentry);
382 382
383 error = -ENOMEM; 383 error = -ENOMEM;
384 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 384 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
385 if (!kbuf) 385 if (!kbuf)
386 goto out_dput; 386 goto out_dput;
387 387
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3a4310d7cb59..f780e223b118 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -58,7 +58,7 @@ xfs_bmbt_to_iomap(
58{ 58{
59 struct xfs_mount *mp = ip->i_mount; 59 struct xfs_mount *mp = ip->i_mount;
60 60
61 if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) 61 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
62 return xfs_alert_fsblock_zero(ip, imap); 62 return xfs_alert_fsblock_zero(ip, imap);
63 63
64 if (imap->br_startblock == HOLESTARTBLOCK) { 64 if (imap->br_startblock == HOLESTARTBLOCK) {
@@ -297,7 +297,7 @@ xfs_iomap_write_direct(
297 goto out_unlock; 297 goto out_unlock;
298 } 298 }
299 299
300 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 300 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
301 error = xfs_alert_fsblock_zero(ip, imap); 301 error = xfs_alert_fsblock_zero(ip, imap);
302 302
303out_unlock: 303out_unlock:
@@ -814,7 +814,7 @@ xfs_iomap_write_unwritten(
814 if (error) 814 if (error)
815 return error; 815 return error;
816 816
817 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 817 if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
818 return xfs_alert_fsblock_zero(ip, &imap); 818 return xfs_alert_fsblock_zero(ip, &imap);
819 819
820 if ((numblks_fsb = imap.br_blockcount) == 0) { 820 if ((numblks_fsb = imap.br_blockcount) == 0) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5c955d35be4..884950adbd16 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -137,7 +137,7 @@ xfs_bulkstat_one_int(
137 xfs_irele(ip); 137 xfs_irele(ip);
138 138
139 error = bc->formatter(bc->breq, buf); 139 error = bc->formatter(bc->breq, buf);
140 if (error == XFS_IBULK_ABORT) 140 if (error == -ECANCELED)
141 goto out_advance; 141 goto out_advance;
142 if (error) 142 if (error)
143 goto out; 143 goto out;
@@ -169,7 +169,7 @@ xfs_bulkstat_one(
169 ASSERT(breq->icount == 1); 169 ASSERT(breq->icount == 1);
170 170
171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
172 KM_SLEEP | KM_MAYFAIL); 172 KM_MAYFAIL);
173 if (!bc.buf) 173 if (!bc.buf)
174 return -ENOMEM; 174 return -ENOMEM;
175 175
@@ -181,7 +181,7 @@ xfs_bulkstat_one(
181 * If we reported one inode to userspace then we abort because we hit 181 * If we reported one inode to userspace then we abort because we hit
182 * the end of the buffer. Don't leak that back to userspace. 182 * the end of the buffer. Don't leak that back to userspace.
183 */ 183 */
184 if (error == XFS_IWALK_ABORT) 184 if (error == -ECANCELED)
185 error = 0; 185 error = 0;
186 186
187 return error; 187 return error;
@@ -243,7 +243,7 @@ xfs_bulkstat(
243 return 0; 243 return 0;
244 244
245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
246 KM_SLEEP | KM_MAYFAIL); 246 KM_MAYFAIL);
247 if (!bc.buf) 247 if (!bc.buf)
248 return -ENOMEM; 248 return -ENOMEM;
249 249
@@ -342,7 +342,7 @@ xfs_inumbers_walk(
342 int error; 342 int error;
343 343
344 error = ic->formatter(ic->breq, &inogrp); 344 error = ic->formatter(ic->breq, &inogrp);
345 if (error && error != XFS_IBULK_ABORT) 345 if (error && error != -ECANCELED)
346 return error; 346 return error;
347 347
348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + 348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index e90c1fc5b981..96a1e2a9be3f 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -18,9 +18,6 @@ struct xfs_ibulk {
18/* Only iterate within the same AG as startino */ 18/* Only iterate within the same AG as startino */
19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) 19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
20 20
21/* Return value that means we want to abort the walk. */
22#define XFS_IBULK_ABORT (XFS_IWALK_ABORT)
23
24/* 21/*
25 * Advance the user buffer pointer by one record of the given size. If the 22 * Advance the user buffer pointer by one record of the given size. If the
26 * buffer is now full, return the appropriate error code. 23 * buffer is now full, return the appropriate error code.
@@ -34,13 +31,21 @@ xfs_ibulk_advance(
34 31
35 breq->ubuffer = b + bytes; 32 breq->ubuffer = b + bytes;
36 breq->ocount++; 33 breq->ocount++;
37 return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; 34 return breq->ocount == breq->icount ? -ECANCELED : 0;
38} 35}
39 36
40/* 37/*
41 * Return stat information in bulk (by-inode) for the filesystem. 38 * Return stat information in bulk (by-inode) for the filesystem.
42 */ 39 */
43 40
41/*
42 * Return codes for the formatter function are 0 to continue iterating, and
43 * non-zero to stop iterating. Any non-zero value will be passed up to the
44 * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop
45 * iteration, as neither bulkstat nor inumbers will ever generate that error
46 * code on their own.
47 */
48
44typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, 49typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
45 const struct xfs_bulkstat *bstat); 50 const struct xfs_bulkstat *bstat);
46 51
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 8c7d727149ea..aa375cf53021 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -31,7 +31,7 @@
31 * inode it finds, it calls a walk function with the relevant inode number and 31 * inode it finds, it calls a walk function with the relevant inode number and
32 * a pointer to caller-provided data. The walk function can return the usual 32 * a pointer to caller-provided data. The walk function can return the usual
33 * negative error code to stop the iteration; 0 to continue the iteration; or 33 * negative error code to stop the iteration; 0 to continue the iteration; or
34 * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the 34 * -ECANCELED to stop the iteration. This return value is returned to the
35 * caller. 35 * caller.
36 * 36 *
37 * Internally, we allow the walk function to do anything, which means that we 37 * Internally, we allow the walk function to do anything, which means that we
@@ -616,7 +616,7 @@ xfs_iwalk_threaded(
616 if (xfs_pwork_ctl_want_abort(&pctl)) 616 if (xfs_pwork_ctl_want_abort(&pctl))
617 break; 617 break;
618 618
619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); 619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
620 iwag->mp = mp; 620 iwag->mp = mp;
621 iwag->iwalk_fn = iwalk_fn; 621 iwag->iwalk_fn = iwalk_fn;
622 iwag->data = data; 622 iwag->data = data;
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 6c960e10ed4d..37a795f03267 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -6,12 +6,17 @@
6#ifndef __XFS_IWALK_H__ 6#ifndef __XFS_IWALK_H__
7#define __XFS_IWALK_H__ 7#define __XFS_IWALK_H__
8 8
9/*
10 * Return codes for the inode/inobt walk function are 0 to continue iterating,
11 * and non-zero to stop iterating. Any non-zero value will be passed up to the
12 * iwalk or inobt_walk caller. The special value -ECANCELED can be used to
13 * stop iteration, as neither iwalk nor inobt_walk will ever generate that
14 * error code on their own.
15 */
16
9/* Walk all inodes in the filesystem starting from @startino. */ 17/* Walk all inodes in the filesystem starting from @startino. */
10typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, 18typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
11 xfs_ino_t ino, void *data); 19 xfs_ino_t ino, void *data);
12/* Return values for xfs_iwalk_fn. */
13#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE)
14#define XFS_IWALK_ABORT (XFS_ITER_ABORT)
15 20
16int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, 21int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
17 unsigned int flags, xfs_iwalk_fn iwalk_fn, 22 unsigned int flags, xfs_iwalk_fn iwalk_fn,
@@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
30 xfs_agnumber_t agno, 35 xfs_agnumber_t agno,
31 const struct xfs_inobt_rec_incore *irec, 36 const struct xfs_inobt_rec_incore *irec,
32 void *data); 37 void *data);
33/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */
34#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT)
35 38
36int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, 39int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
37 xfs_ino_t startino, unsigned int flags, 40 xfs_ino_t startino, unsigned int flags,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7fc3c1ad36bc..a2beee9f74da 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -214,15 +214,42 @@ xlog_grant_head_wake(
214{ 214{
215 struct xlog_ticket *tic; 215 struct xlog_ticket *tic;
216 int need_bytes; 216 int need_bytes;
217 bool woken_task = false;
217 218
218 list_for_each_entry(tic, &head->waiters, t_queue) { 219 list_for_each_entry(tic, &head->waiters, t_queue) {
220
221 /*
222 * There is a chance that the size of the CIL checkpoints in
223 * progress at the last AIL push target calculation resulted in
224 * limiting the target to the log head (l_last_sync_lsn) at the
225 * time. This may not reflect where the log head is now as the
226 * CIL checkpoints may have completed.
227 *
228 * Hence when we are woken here, it may be that the head of the
229 * log that has moved rather than the tail. As the tail didn't
230 * move, there still won't be space available for the
231 * reservation we require. However, if the AIL has already
232 * pushed to the target defined by the old log head location, we
233 * will hang here waiting for something else to update the AIL
234 * push target.
235 *
236 * Therefore, if there isn't space to wake the first waiter on
237 * the grant head, we need to push the AIL again to ensure the
238 * target reflects both the current log tail and log head
239 * position before we wait for the tail to move again.
240 */
241
219 need_bytes = xlog_ticket_reservation(log, head, tic); 242 need_bytes = xlog_ticket_reservation(log, head, tic);
220 if (*free_bytes < need_bytes) 243 if (*free_bytes < need_bytes) {
244 if (!woken_task)
245 xlog_grant_push_ail(log, need_bytes);
221 return false; 246 return false;
247 }
222 248
223 *free_bytes -= need_bytes; 249 *free_bytes -= need_bytes;
224 trace_xfs_log_grant_wake_up(log, tic); 250 trace_xfs_log_grant_wake_up(log, tic);
225 wake_up_process(tic->t_task); 251 wake_up_process(tic->t_task);
252 woken_task = true;
226 } 253 }
227 254
228 return true; 255 return true;
@@ -428,8 +455,7 @@ xfs_log_reserve(
428 XFS_STATS_INC(mp, xs_try_logspace); 455 XFS_STATS_INC(mp, xs_try_logspace);
429 456
430 ASSERT(*ticp == NULL); 457 ASSERT(*ticp == NULL);
431 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 458 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
432 KM_SLEEP);
433 *ticp = tic; 459 *ticp = tic;
434 460
435 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt 461 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -1404,6 +1430,7 @@ xlog_alloc_log(
1404 */ 1430 */
1405 ASSERT(log->l_iclog_size >= 4096); 1431 ASSERT(log->l_iclog_size >= 4096);
1406 for (i = 0; i < log->l_iclog_bufs; i++) { 1432 for (i = 0; i < log->l_iclog_bufs; i++) {
1433 int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
1407 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1434 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
1408 sizeof(struct bio_vec); 1435 sizeof(struct bio_vec);
1409 1436
@@ -1415,8 +1442,8 @@ xlog_alloc_log(
1415 iclog->ic_prev = prev_iclog; 1442 iclog->ic_prev = prev_iclog;
1416 prev_iclog = iclog; 1443 prev_iclog = iclog;
1417 1444
1418 iclog->ic_data = kmem_alloc_large(log->l_iclog_size, 1445 iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
1419 KM_MAYFAIL); 1446 KM_MAYFAIL);
1420 if (!iclog->ic_data) 1447 if (!iclog->ic_data)
1421 goto out_free_iclog; 1448 goto out_free_iclog;
1422#ifdef DEBUG 1449#ifdef DEBUG
@@ -2496,21 +2523,35 @@ next_lv:
2496 ***************************************************************************** 2523 *****************************************************************************
2497 */ 2524 */
2498 2525
2499/* Clean iclogs starting from the head. This ordering must be 2526/*
2500 * maintained, so an iclog doesn't become ACTIVE beyond one that 2527 * An iclog has just finished IO completion processing, so we need to update
2501 * is SYNCING. This is also required to maintain the notion that we use 2528 * the iclog state and propagate that up into the overall log state. Hence we
2502 * a ordered wait queue to hold off would be writers to the log when every 2529 * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
2503 * iclog is trying to sync to disk. 2530 * starting from the head, and then wake up any threads that are waiting for the
2531 * iclog to be marked clean.
2532 *
2533 * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
2534 * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
2535 * maintain the notion that we use a ordered wait queue to hold off would be
2536 * writers to the log when every iclog is trying to sync to disk.
2537 *
2538 * Caller must hold the icloglock before calling us.
2504 * 2539 *
2505 * State Change: DIRTY -> ACTIVE 2540 * State Change: !IOERROR -> DIRTY -> ACTIVE
2506 */ 2541 */
2507STATIC void 2542STATIC void
2508xlog_state_clean_log( 2543xlog_state_clean_iclog(
2509 struct xlog *log) 2544 struct xlog *log,
2545 struct xlog_in_core *dirty_iclog)
2510{ 2546{
2511 xlog_in_core_t *iclog; 2547 struct xlog_in_core *iclog;
2512 int changed = 0; 2548 int changed = 0;
2513 2549
2550 /* Prepare the completed iclog. */
2551 if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
2552 dirty_iclog->ic_state = XLOG_STATE_DIRTY;
2553
2554 /* Walk all the iclogs to update the ordered active state. */
2514 iclog = log->l_iclog; 2555 iclog = log->l_iclog;
2515 do { 2556 do {
2516 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2557 if (iclog->ic_state == XLOG_STATE_DIRTY) {
@@ -2548,7 +2589,13 @@ xlog_state_clean_log(
2548 iclog = iclog->ic_next; 2589 iclog = iclog->ic_next;
2549 } while (iclog != log->l_iclog); 2590 } while (iclog != log->l_iclog);
2550 2591
2551 /* log is locked when we are called */ 2592
2593 /*
2594 * Wake up threads waiting in xfs_log_force() for the dirty iclog
2595 * to be cleaned.
2596 */
2597 wake_up_all(&dirty_iclog->ic_force_wait);
2598
2552 /* 2599 /*
2553 * Change state for the dummy log recording. 2600 * Change state for the dummy log recording.
2554 * We usually go to NEED. But we go to NEED2 if the changed indicates 2601 * We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2582,7 +2629,7 @@ xlog_state_clean_log(
2582 ASSERT(0); 2629 ASSERT(0);
2583 } 2630 }
2584 } 2631 }
2585} /* xlog_state_clean_log */ 2632}
2586 2633
2587STATIC xfs_lsn_t 2634STATIC xfs_lsn_t
2588xlog_get_lowest_lsn( 2635xlog_get_lowest_lsn(
@@ -2603,30 +2650,205 @@ xlog_get_lowest_lsn(
2603 return lowest_lsn; 2650 return lowest_lsn;
2604} 2651}
2605 2652
2653/*
2654 * Completion of a iclog IO does not imply that a transaction has completed, as
2655 * transactions can be large enough to span many iclogs. We cannot change the
2656 * tail of the log half way through a transaction as this may be the only
2657 * transaction in the log and moving the tail to point to the middle of it
2658 * will prevent recovery from finding the start of the transaction. Hence we
2659 * should only update the last_sync_lsn if this iclog contains transaction
2660 * completion callbacks on it.
2661 *
2662 * We have to do this before we drop the icloglock to ensure we are the only one
2663 * that can update it.
2664 *
2665 * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
2666 * the reservation grant head pushing. This is due to the fact that the push
2667 * target is bound by the current last_sync_lsn value. Hence if we have a large
2668 * amount of log space bound up in this committing transaction then the
2669 * last_sync_lsn value may be the limiting factor preventing tail pushing from
2670 * freeing space in the log. Hence once we've updated the last_sync_lsn we
2671 * should push the AIL to ensure the push target (and hence the grant head) is
2672 * no longer bound by the old log head location and can move forwards and make
2673 * progress again.
2674 */
2675static void
2676xlog_state_set_callback(
2677 struct xlog *log,
2678 struct xlog_in_core *iclog,
2679 xfs_lsn_t header_lsn)
2680{
2681 iclog->ic_state = XLOG_STATE_CALLBACK;
2682
2683 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2684 header_lsn) <= 0);
2685
2686 if (list_empty_careful(&iclog->ic_callbacks))
2687 return;
2688
2689 atomic64_set(&log->l_last_sync_lsn, header_lsn);
2690 xlog_grant_push_ail(log, 0);
2691}
2692
2693/*
2694 * Return true if we need to stop processing, false to continue to the next
2695 * iclog. The caller will need to run callbacks if the iclog is returned in the
2696 * XLOG_STATE_CALLBACK state.
2697 */
2698static bool
2699xlog_state_iodone_process_iclog(
2700 struct xlog *log,
2701 struct xlog_in_core *iclog,
2702 struct xlog_in_core *completed_iclog,
2703 bool *ioerror)
2704{
2705 xfs_lsn_t lowest_lsn;
2706 xfs_lsn_t header_lsn;
2707
2708 /* Skip all iclogs in the ACTIVE & DIRTY states */
2709 if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
2710 return false;
2711
2712 /*
2713 * Between marking a filesystem SHUTDOWN and stopping the log, we do
2714 * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
2715 * want things to go smoothly in case of just a SHUTDOWN w/o a
2716 * LOG_IO_ERROR.
2717 */
2718 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2719 *ioerror = true;
2720 return false;
2721 }
2722
2723 /*
2724 * Can only perform callbacks in order. Since this iclog is not in the
2725 * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean
2726 * up. If we set our iclog to DO_CALLBACK, we will not process it when
2727 * we retry since a previous iclog is in the CALLBACK and the state
2728 * cannot change since we are holding the l_icloglock.
2729 */
2730 if (!(iclog->ic_state &
2731 (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
2732 if (completed_iclog &&
2733 (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
2734 completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
2735 }
2736 return true;
2737 }
2738
2739 /*
2740 * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
2741 * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught
2742 * by the above if and are going to clean (i.e. we aren't doing their
2743 * callbacks) see the above if.
2744 *
2745 * We will do one more check here to see if we have chased our tail
2746 * around. If this is not the lowest lsn iclog, then we will leave it
2747 * for another completion to process.
2748 */
2749 header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2750 lowest_lsn = xlog_get_lowest_lsn(log);
2751 if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
2752 return false;
2753
2754 xlog_state_set_callback(log, iclog, header_lsn);
2755 return false;
2756
2757}
2758
2759/*
2760 * Keep processing entries in the iclog callback list until we come around and
2761 * it is empty. We need to atomically see that the list is empty and change the
2762 * state to DIRTY so that we don't miss any more callbacks being added.
2763 *
2764 * This function is called with the icloglock held and returns with it held. We
2765 * drop it while running callbacks, however, as holding it over thousands of
2766 * callbacks is unnecessary and causes excessive contention if we do.
2767 */
2768static void
2769xlog_state_do_iclog_callbacks(
2770 struct xlog *log,
2771 struct xlog_in_core *iclog,
2772 bool aborted)
2773{
2774 spin_unlock(&log->l_icloglock);
2775 spin_lock(&iclog->ic_callback_lock);
2776 while (!list_empty(&iclog->ic_callbacks)) {
2777 LIST_HEAD(tmp);
2778
2779 list_splice_init(&iclog->ic_callbacks, &tmp);
2780
2781 spin_unlock(&iclog->ic_callback_lock);
2782 xlog_cil_process_committed(&tmp, aborted);
2783 spin_lock(&iclog->ic_callback_lock);
2784 }
2785
2786 /*
2787 * Pick up the icloglock while still holding the callback lock so we
2788 * serialise against anyone trying to add more callbacks to this iclog
2789 * now we've finished processing.
2790 */
2791 spin_lock(&log->l_icloglock);
2792 spin_unlock(&iclog->ic_callback_lock);
2793}
2794
2795#ifdef DEBUG
2796/*
2797 * Make one last gasp attempt to see if iclogs are being left in limbo. If the
2798 * above loop finds an iclog earlier than the current iclog and in one of the
2799 * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
2800 * are deferred to the completion of the earlier iclog. Walk the iclogs in order
2801 * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
2802 * one of the syncing states.
2803 *
2804 * Note that SYNCING|IOERROR is a valid state so we cannot just check for
2805 * ic_state == SYNCING.
2806 */
2807static void
2808xlog_state_callback_check_state(
2809 struct xlog *log)
2810{
2811 struct xlog_in_core *first_iclog = log->l_iclog;
2812 struct xlog_in_core *iclog = first_iclog;
2813
2814 do {
2815 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2816 /*
2817 * Terminate the loop if iclogs are found in states
2818 * which will cause other threads to clean up iclogs.
2819 *
2820 * SYNCING - i/o completion will go through logs
2821 * DONE_SYNC - interrupt thread should be waiting for
2822 * l_icloglock
2823 * IOERROR - give up hope all ye who enter here
2824 */
2825 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2826 iclog->ic_state & XLOG_STATE_SYNCING ||
2827 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2828 iclog->ic_state == XLOG_STATE_IOERROR )
2829 break;
2830 iclog = iclog->ic_next;
2831 } while (first_iclog != iclog);
2832}
2833#else
2834#define xlog_state_callback_check_state(l) ((void)0)
2835#endif
2836
2606STATIC void 2837STATIC void
2607xlog_state_do_callback( 2838xlog_state_do_callback(
2608 struct xlog *log, 2839 struct xlog *log,
2609 bool aborted, 2840 bool aborted,
2610 struct xlog_in_core *ciclog) 2841 struct xlog_in_core *ciclog)
2611{ 2842{
2612 xlog_in_core_t *iclog; 2843 struct xlog_in_core *iclog;
2613 xlog_in_core_t *first_iclog; /* used to know when we've 2844 struct xlog_in_core *first_iclog;
2614 * processed all iclogs once */ 2845 bool did_callbacks = false;
2615 int flushcnt = 0; 2846 bool cycled_icloglock;
2616 xfs_lsn_t lowest_lsn; 2847 bool ioerror;
2617 int ioerrors; /* counter: iclogs with errors */ 2848 int flushcnt = 0;
2618 int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2849 int repeats = 0;
2619 int funcdidcallbacks; /* flag: function did callbacks */
2620 int repeats; /* for issuing console warnings if
2621 * looping too many times */
2622 int wake = 0;
2623 2850
2624 spin_lock(&log->l_icloglock); 2851 spin_lock(&log->l_icloglock);
2625 first_iclog = iclog = log->l_iclog;
2626 ioerrors = 0;
2627 funcdidcallbacks = 0;
2628 repeats = 0;
2629
2630 do { 2852 do {
2631 /* 2853 /*
2632 * Scan all iclogs starting with the one pointed to by the 2854 * Scan all iclogs starting with the one pointed to by the
@@ -2638,137 +2860,34 @@ xlog_state_do_callback(
2638 */ 2860 */
2639 first_iclog = log->l_iclog; 2861 first_iclog = log->l_iclog;
2640 iclog = log->l_iclog; 2862 iclog = log->l_iclog;
2641 loopdidcallbacks = 0; 2863 cycled_icloglock = false;
2864 ioerror = false;
2642 repeats++; 2865 repeats++;
2643 2866
2644 do { 2867 do {
2868 if (xlog_state_iodone_process_iclog(log, iclog,
2869 ciclog, &ioerror))
2870 break;
2645 2871
2646 /* skip all iclogs in the ACTIVE & DIRTY states */ 2872 if (!(iclog->ic_state &
2647 if (iclog->ic_state & 2873 (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
2648 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2649 iclog = iclog->ic_next; 2874 iclog = iclog->ic_next;
2650 continue; 2875 continue;
2651 } 2876 }
2652 2877
2653 /* 2878 /*
2654 * Between marking a filesystem SHUTDOWN and stopping 2879 * Running callbacks will drop the icloglock which means
2655 * the log, we do flush all iclogs to disk (if there 2880 * we'll have to run at least one more complete loop.
2656 * wasn't a log I/O error). So, we do want things to
2657 * go smoothly in case of just a SHUTDOWN w/o a
2658 * LOG_IO_ERROR.
2659 */
2660 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2661 /*
2662 * Can only perform callbacks in order. Since
2663 * this iclog is not in the DONE_SYNC/
2664 * DO_CALLBACK state, we skip the rest and
2665 * just try to clean up. If we set our iclog
2666 * to DO_CALLBACK, we will not process it when
2667 * we retry since a previous iclog is in the
2668 * CALLBACK and the state cannot change since
2669 * we are holding the l_icloglock.
2670 */
2671 if (!(iclog->ic_state &
2672 (XLOG_STATE_DONE_SYNC |
2673 XLOG_STATE_DO_CALLBACK))) {
2674 if (ciclog && (ciclog->ic_state ==
2675 XLOG_STATE_DONE_SYNC)) {
2676 ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2677 }
2678 break;
2679 }
2680 /*
2681 * We now have an iclog that is in either the
2682 * DO_CALLBACK or DONE_SYNC states. The other
2683 * states (WANT_SYNC, SYNCING, or CALLBACK were
2684 * caught by the above if and are going to
2685 * clean (i.e. we aren't doing their callbacks)
2686 * see the above if.
2687 */
2688
2689 /*
2690 * We will do one more check here to see if we
2691 * have chased our tail around.
2692 */
2693
2694 lowest_lsn = xlog_get_lowest_lsn(log);
2695 if (lowest_lsn &&
2696 XFS_LSN_CMP(lowest_lsn,
2697 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2698 iclog = iclog->ic_next;
2699 continue; /* Leave this iclog for
2700 * another thread */
2701 }
2702
2703 iclog->ic_state = XLOG_STATE_CALLBACK;
2704
2705
2706 /*
2707 * Completion of a iclog IO does not imply that
2708 * a transaction has completed, as transactions
2709 * can be large enough to span many iclogs. We
2710 * cannot change the tail of the log half way
2711 * through a transaction as this may be the only
2712 * transaction in the log and moving th etail to
2713 * point to the middle of it will prevent
2714 * recovery from finding the start of the
2715 * transaction. Hence we should only update the
2716 * last_sync_lsn if this iclog contains
2717 * transaction completion callbacks on it.
2718 *
2719 * We have to do this before we drop the
2720 * icloglock to ensure we are the only one that
2721 * can update it.
2722 */
2723 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2724 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2725 if (!list_empty_careful(&iclog->ic_callbacks))
2726 atomic64_set(&log->l_last_sync_lsn,
2727 be64_to_cpu(iclog->ic_header.h_lsn));
2728
2729 } else
2730 ioerrors++;
2731
2732 spin_unlock(&log->l_icloglock);
2733
2734 /*
2735 * Keep processing entries in the callback list until
2736 * we come around and it is empty. We need to
2737 * atomically see that the list is empty and change the
2738 * state to DIRTY so that we don't miss any more
2739 * callbacks being added.
2740 */
2741 spin_lock(&iclog->ic_callback_lock);
2742 while (!list_empty(&iclog->ic_callbacks)) {
2743 LIST_HEAD(tmp);
2744
2745 list_splice_init(&iclog->ic_callbacks, &tmp);
2746
2747 spin_unlock(&iclog->ic_callback_lock);
2748 xlog_cil_process_committed(&tmp, aborted);
2749 spin_lock(&iclog->ic_callback_lock);
2750 }
2751
2752 loopdidcallbacks++;
2753 funcdidcallbacks++;
2754
2755 spin_lock(&log->l_icloglock);
2756 spin_unlock(&iclog->ic_callback_lock);
2757 if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2758 iclog->ic_state = XLOG_STATE_DIRTY;
2759
2760 /*
2761 * Transition from DIRTY to ACTIVE if applicable.
2762 * NOP if STATE_IOERROR.
2763 */ 2881 */
2764 xlog_state_clean_log(log); 2882 cycled_icloglock = true;
2765 2883 xlog_state_do_iclog_callbacks(log, iclog, aborted);
2766 /* wake up threads waiting in xfs_log_force() */
2767 wake_up_all(&iclog->ic_force_wait);
2768 2884
2885 xlog_state_clean_iclog(log, iclog);
2769 iclog = iclog->ic_next; 2886 iclog = iclog->ic_next;
2770 } while (first_iclog != iclog); 2887 } while (first_iclog != iclog);
2771 2888
2889 did_callbacks |= cycled_icloglock;
2890
2772 if (repeats > 5000) { 2891 if (repeats > 5000) {
2773 flushcnt += repeats; 2892 flushcnt += repeats;
2774 repeats = 0; 2893 repeats = 0;
@@ -2776,50 +2895,15 @@ xlog_state_do_callback(
2776 "%s: possible infinite loop (%d iterations)", 2895 "%s: possible infinite loop (%d iterations)",
2777 __func__, flushcnt); 2896 __func__, flushcnt);
2778 } 2897 }
2779 } while (!ioerrors && loopdidcallbacks); 2898 } while (!ioerror && cycled_icloglock);
2780 2899
2781#ifdef DEBUG 2900 if (did_callbacks)
2782 /* 2901 xlog_state_callback_check_state(log);
2783 * Make one last gasp attempt to see if iclogs are being left in limbo.
2784 * If the above loop finds an iclog earlier than the current iclog and
2785 * in one of the syncing states, the current iclog is put into
2786 * DO_CALLBACK and the callbacks are deferred to the completion of the
2787 * earlier iclog. Walk the iclogs in order and make sure that no iclog
2788 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
2789 * states.
2790 *
2791 * Note that SYNCING|IOABORT is a valid state so we cannot just check
2792 * for ic_state == SYNCING.
2793 */
2794 if (funcdidcallbacks) {
2795 first_iclog = iclog = log->l_iclog;
2796 do {
2797 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2798 /*
2799 * Terminate the loop if iclogs are found in states
2800 * which will cause other threads to clean up iclogs.
2801 *
2802 * SYNCING - i/o completion will go through logs
2803 * DONE_SYNC - interrupt thread should be waiting for
2804 * l_icloglock
2805 * IOERROR - give up hope all ye who enter here
2806 */
2807 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2808 iclog->ic_state & XLOG_STATE_SYNCING ||
2809 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2810 iclog->ic_state == XLOG_STATE_IOERROR )
2811 break;
2812 iclog = iclog->ic_next;
2813 } while (first_iclog != iclog);
2814 }
2815#endif
2816 2902
2817 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2903 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
2818 wake = 1;
2819 spin_unlock(&log->l_icloglock);
2820
2821 if (wake)
2822 wake_up_all(&log->l_flush_wait); 2904 wake_up_all(&log->l_flush_wait);
2905
2906 spin_unlock(&log->l_icloglock);
2823} 2907}
2824 2908
2825 2909
@@ -3919,7 +4003,9 @@ xfs_log_force_umount(
3919 * item committed callback functions will do this again under lock to 4003 * item committed callback functions will do this again under lock to
3920 * avoid races. 4004 * avoid races.
3921 */ 4005 */
4006 spin_lock(&log->l_cilp->xc_push_lock);
3922 wake_up_all(&log->l_cilp->xc_commit_wait); 4007 wake_up_all(&log->l_cilp->xc_commit_wait);
4008 spin_unlock(&log->l_cilp->xc_push_lock);
3923 xlog_state_do_callback(log, true, NULL); 4009 xlog_state_do_callback(log, true, NULL);
3924 4010
3925#ifdef XFSERRORDEBUG 4011#ifdef XFSERRORDEBUG
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index fa5602d0fd7f..ef652abd112c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -38,7 +38,7 @@ xlog_cil_ticket_alloc(
38 struct xlog_ticket *tic; 38 struct xlog_ticket *tic;
39 39
40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, 40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
41 KM_SLEEP|KM_NOFS); 41 KM_NOFS);
42 42
43 /* 43 /*
44 * set the current reservation to zero so we know to steal the basic 44 * set the current reservation to zero so we know to steal the basic
@@ -186,7 +186,7 @@ xlog_cil_alloc_shadow_bufs(
186 */ 186 */
187 kmem_free(lip->li_lv_shadow); 187 kmem_free(lip->li_lv_shadow);
188 188
189 lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); 189 lv = kmem_alloc_large(buf_size, KM_NOFS);
190 memset(lv, 0, xlog_cil_iovec_space(niovecs)); 190 memset(lv, 0, xlog_cil_iovec_space(niovecs));
191 191
192 lv->lv_item = lip; 192 lv->lv_item = lip;
@@ -660,7 +660,7 @@ xlog_cil_push(
660 if (!cil) 660 if (!cil)
661 return 0; 661 return 0;
662 662
663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
664 new_ctx->ticket = xlog_cil_ticket_alloc(log); 664 new_ctx->ticket = xlog_cil_ticket_alloc(log);
665 665
666 down_write(&cil->xc_ctx_lock); 666 down_write(&cil->xc_ctx_lock);
@@ -1179,11 +1179,11 @@ xlog_cil_init(
1179 struct xfs_cil *cil; 1179 struct xfs_cil *cil;
1180 struct xfs_cil_ctx *ctx; 1180 struct xfs_cil_ctx *ctx;
1181 1181
1182 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 1182 cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
1183 if (!cil) 1183 if (!cil)
1184 return -ENOMEM; 1184 return -ENOMEM;
1185 1185
1186 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 1186 ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
1187 if (!ctx) { 1187 if (!ctx) {
1188 kmem_free(cil); 1188 kmem_free(cil);
1189 return -ENOMEM; 1189 return -ENOMEM;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13d1d3e95b88..508319039dce 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -97,6 +97,8 @@ xlog_alloc_buffer(
97 struct xlog *log, 97 struct xlog *log,
98 int nbblks) 98 int nbblks)
99{ 99{
100 int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
101
100 /* 102 /*
101 * Pass log block 0 since we don't have an addr yet, buffer will be 103 * Pass log block 0 since we don't have an addr yet, buffer will be
102 * verified on read. 104 * verified on read.
@@ -125,7 +127,7 @@ xlog_alloc_buffer(
125 if (nbblks > 1 && log->l_sectBBsize > 1) 127 if (nbblks > 1 && log->l_sectBBsize > 1)
126 nbblks += log->l_sectBBsize; 128 nbblks += log->l_sectBBsize;
127 nbblks = round_up(nbblks, log->l_sectBBsize); 129 nbblks = round_up(nbblks, log->l_sectBBsize);
128 return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); 130 return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
129} 131}
130 132
131/* 133/*
@@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1(
1960 } 1962 }
1961 } 1963 }
1962 1964
1963 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1965 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
1964 bcp->bc_blkno = buf_f->blf_blkno; 1966 bcp->bc_blkno = buf_f->blf_blkno;
1965 bcp->bc_len = buf_f->blf_len; 1967 bcp->bc_len = buf_f->blf_len;
1966 bcp->bc_refcount = 1; 1968 bcp->bc_refcount = 1;
@@ -2930,7 +2932,7 @@ xlog_recover_inode_pass2(
2930 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 2932 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
2931 in_f = item->ri_buf[0].i_addr; 2933 in_f = item->ri_buf[0].i_addr;
2932 } else { 2934 } else {
2933 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); 2935 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
2934 need_free = 1; 2936 need_free = 1;
2935 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2937 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2936 if (error) 2938 if (error)
@@ -4161,7 +4163,7 @@ xlog_recover_add_item(
4161{ 4163{
4162 xlog_recover_item_t *item; 4164 xlog_recover_item_t *item;
4163 4165
4164 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 4166 item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
4165 INIT_LIST_HEAD(&item->ri_list); 4167 INIT_LIST_HEAD(&item->ri_list);
4166 list_add_tail(&item->ri_list, head); 4168 list_add_tail(&item->ri_list, head);
4167} 4169}
@@ -4201,7 +4203,7 @@ xlog_recover_add_to_cont_trans(
4201 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 4203 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
4202 old_len = item->ri_buf[item->ri_cnt-1].i_len; 4204 old_len = item->ri_buf[item->ri_cnt-1].i_len;
4203 4205
4204 ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); 4206 ptr = kmem_realloc(old_ptr, len + old_len, 0);
4205 memcpy(&ptr[old_len], dp, len); 4207 memcpy(&ptr[old_len], dp, len);
4206 item->ri_buf[item->ri_cnt-1].i_len += len; 4208 item->ri_buf[item->ri_cnt-1].i_len += len;
4207 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 4209 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4261,7 +4263,7 @@ xlog_recover_add_to_trans(
4261 return 0; 4263 return 0;
4262 } 4264 }
4263 4265
4264 ptr = kmem_alloc(len, KM_SLEEP); 4266 ptr = kmem_alloc(len, 0);
4265 memcpy(ptr, dp, len); 4267 memcpy(ptr, dp, len);
4266 in_f = (struct xfs_inode_log_format *)ptr; 4268 in_f = (struct xfs_inode_log_format *)ptr;
4267 4269
@@ -4289,7 +4291,7 @@ xlog_recover_add_to_trans(
4289 item->ri_total = in_f->ilf_size; 4291 item->ri_total = in_f->ilf_size;
4290 item->ri_buf = 4292 item->ri_buf =
4291 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 4293 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4292 KM_SLEEP); 4294 0);
4293 } 4295 }
4294 ASSERT(item->ri_total > item->ri_cnt); 4296 ASSERT(item->ri_total > item->ri_cnt);
4295 /* Description region is ri_buf[0] */ 4297 /* Description region is ri_buf[0] */
@@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans(
4423 * This is a new transaction so allocate a new recovery container to 4425 * This is a new transaction so allocate a new recovery container to
4424 * hold the recovery ops that will follow. 4426 * hold the recovery ops that will follow.
4425 */ 4427 */
4426 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 4428 trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
4427 trans->r_log_tid = tid; 4429 trans->r_log_tid = tid;
4428 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 4430 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4429 INIT_LIST_HEAD(&trans->r_itemq); 4431 INIT_LIST_HEAD(&trans->r_itemq);
@@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink(
5022} 5024}
5023 5025
5024/* 5026/*
5025 * xlog_iunlink_recover 5027 * Recover AGI unlinked lists
5028 *
5029 * This is called during recovery to process any inodes which we unlinked but
5030 * not freed when the system crashed. These inodes will be on the lists in the
5031 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
5032 * any inodes found on the lists. Each inode is removed from the lists when it
5033 * has been fully truncated and is freed. The freeing of the inode and its
5034 * removal from the list must be atomic.
5035 *
5036 * If everything we touch in the agi processing loop is already in memory, this
5037 * loop can hold the cpu for a long time. It runs without lock contention,
5038 * memory allocation contention, the need wait for IO, etc, and so will run
5039 * until we either run out of inodes to process, run low on memory or we run out
5040 * of log space.
5026 * 5041 *
5027 * This is called during recovery to process any inodes which 5042 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
5028 * we unlinked but not freed when the system crashed. These 5043 * and can prevent other filesystem work (such as CIL pushes) from running. This
5029 * inodes will be on the lists in the AGI blocks. What we do 5044 * can lead to deadlocks if the recovery process runs out of log reservation
5030 * here is scan all the AGIs and fully truncate and free any 5045 * space. Hence we need to yield the CPU when there is other kernel work
5031 * inodes found on the lists. Each inode is removed from the 5046 * scheduled on this CPU to ensure other scheduled work can run without undue
5032 * lists when it has been fully truncated and is freed. The 5047 * latency.
5033 * freeing of the inode and its removal from the list must be
5034 * atomic.
5035 */ 5048 */
5036STATIC void 5049STATIC void
5037xlog_recover_process_iunlinks( 5050xlog_recover_process_iunlinks(
@@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks(
5078 while (agino != NULLAGINO) { 5091 while (agino != NULLAGINO) {
5079 agino = xlog_recover_process_one_iunlink(mp, 5092 agino = xlog_recover_process_one_iunlink(mp,
5080 agno, agino, bucket); 5093 agno, agino, bucket);
5094 cond_resched();
5081 } 5095 }
5082 } 5096 }
5083 xfs_buf_rele(agibp); 5097 xfs_buf_rele(agibp);
@@ -5527,7 +5541,7 @@ xlog_do_log_recovery(
5527 */ 5541 */
5528 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 5542 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
5529 sizeof(struct list_head), 5543 sizeof(struct list_head),
5530 KM_SLEEP); 5544 0);
5531 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 5545 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5532 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 5546 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
5533 5547
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 322da6909290..ba5b6f3b2b88 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -82,7 +82,7 @@ xfs_uuid_mount(
82 if (hole < 0) { 82 if (hole < 0) {
83 xfs_uuid_table = kmem_realloc(xfs_uuid_table, 83 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
85 KM_SLEEP); 85 0);
86 hole = xfs_uuid_table_size++; 86 hole = xfs_uuid_table_size++;
87 } 87 }
88 xfs_uuid_table[hole] = *uuid; 88 xfs_uuid_table[hole] = *uuid;
@@ -214,7 +214,7 @@ xfs_initialize_perag(
214 214
215 spin_lock(&mp->m_perag_lock); 215 spin_lock(&mp->m_perag_lock);
216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
217 BUG(); 217 WARN_ON_ONCE(1);
218 spin_unlock(&mp->m_perag_lock); 218 spin_unlock(&mp->m_perag_lock);
219 radix_tree_preload_end(); 219 radix_tree_preload_end();
220 error = -EEXIST; 220 error = -EEXIST;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4adb6837439a..fdb60e09a9c5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,13 +327,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327} 327}
328 328
329/* per-AG block reservation data structures*/ 329/* per-AG block reservation data structures*/
330enum xfs_ag_resv_type {
331 XFS_AG_RESV_NONE = 0,
332 XFS_AG_RESV_AGFL,
333 XFS_AG_RESV_METADATA,
334 XFS_AG_RESV_RMAPBT,
335};
336
337struct xfs_ag_resv { 330struct xfs_ag_resv {
338 /* number of blocks originally reserved here */ 331 /* number of blocks originally reserved here */
339 xfs_extlen_t ar_orig_reserved; 332 xfs_extlen_t ar_orig_reserved;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 74738813f60d..a06661dac5be 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,12 +333,12 @@ xfs_mru_cache_create(
333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) 333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
334 return -EINVAL; 334 return -EINVAL;
335 335
336 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) 336 if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
337 return -ENOMEM; 337 return -ENOMEM;
338 338
339 /* An extra list is needed to avoid reaping up to a grp_time early. */ 339 /* An extra list is needed to avoid reaping up to a grp_time early. */
340 mru->grp_count = grp_count + 1; 340 mru->grp_count = grp_count + 1;
341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); 341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
342 342
343 if (!mru->lists) { 343 if (!mru->lists) {
344 err = -ENOMEM; 344 err = -ENOMEM;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5e7a37f0cf84..ecd8ce152ab1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -642,7 +642,7 @@ xfs_qm_init_quotainfo(
642 642
643 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 643 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
644 644
645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
646 646
647 error = list_lru_init(&qinf->qi_lru); 647 error = list_lru_init(&qinf->qi_lru);
648 if (error) 648 if (error)
@@ -978,7 +978,7 @@ xfs_qm_reset_dqcounts_buf(
978 if (qip->i_d.di_nblocks == 0) 978 if (qip->i_d.di_nblocks == 0)
979 return 0; 979 return 0;
980 980
981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
982 982
983 lblkno = 0; 983 lblkno = 0;
984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d8288aa0670a..2328268e6245 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -144,9 +144,9 @@ xfs_cui_init(
144 ASSERT(nextents > 0); 144 ASSERT(nextents > 0);
145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS) 145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
147 KM_SLEEP); 147 0);
148 else 148 else
149 cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); 149 cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
150 150
151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); 151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
152 cuip->cui_format.cui_nextents = nextents; 152 cuip->cui_format.cui_nextents = nextents;
@@ -223,7 +223,7 @@ xfs_trans_get_cud(
223{ 223{
224 struct xfs_cud_log_item *cudp; 224 struct xfs_cud_log_item *cudp;
225 225
226 cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); 226 cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, 227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
228 &xfs_cud_item_ops); 228 &xfs_cud_item_ops);
229 cudp->cud_cuip = cuip; 229 cudp->cud_cuip = cuip;
@@ -555,26 +555,24 @@ xfs_cui_recover(
555 irec.br_blockcount = new_len; 555 irec.br_blockcount = new_len;
556 switch (type) { 556 switch (type) {
557 case XFS_REFCOUNT_INCREASE: 557 case XFS_REFCOUNT_INCREASE:
558 error = xfs_refcount_increase_extent(tp, &irec); 558 xfs_refcount_increase_extent(tp, &irec);
559 break; 559 break;
560 case XFS_REFCOUNT_DECREASE: 560 case XFS_REFCOUNT_DECREASE:
561 error = xfs_refcount_decrease_extent(tp, &irec); 561 xfs_refcount_decrease_extent(tp, &irec);
562 break; 562 break;
563 case XFS_REFCOUNT_ALLOC_COW: 563 case XFS_REFCOUNT_ALLOC_COW:
564 error = xfs_refcount_alloc_cow_extent(tp, 564 xfs_refcount_alloc_cow_extent(tp,
565 irec.br_startblock, 565 irec.br_startblock,
566 irec.br_blockcount); 566 irec.br_blockcount);
567 break; 567 break;
568 case XFS_REFCOUNT_FREE_COW: 568 case XFS_REFCOUNT_FREE_COW:
569 error = xfs_refcount_free_cow_extent(tp, 569 xfs_refcount_free_cow_extent(tp,
570 irec.br_startblock, 570 irec.br_startblock,
571 irec.br_blockcount); 571 irec.br_blockcount);
572 break; 572 break;
573 default: 573 default:
574 ASSERT(0); 574 ASSERT(0);
575 } 575 }
576 if (error)
577 goto abort_error;
578 requeue_only = true; 576 requeue_only = true;
579 } 577 }
580 } 578 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index edbe37b7f636..0f08153b4994 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -495,10 +495,8 @@ xfs_reflink_cancel_cow_blocks(
495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); 495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
496 496
497 /* Free the CoW orphan record. */ 497 /* Free the CoW orphan record. */
498 error = xfs_refcount_free_cow_extent(*tpp, 498 xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
499 del.br_startblock, del.br_blockcount); 499 del.br_blockcount);
500 if (error)
501 break;
502 500
503 xfs_bmap_add_free(*tpp, del.br_startblock, 501 xfs_bmap_add_free(*tpp, del.br_startblock,
504 del.br_blockcount, NULL); 502 del.br_blockcount, NULL);
@@ -675,15 +673,10 @@ xfs_reflink_end_cow_extent(
675 trace_xfs_reflink_cow_remap(ip, &del); 673 trace_xfs_reflink_cow_remap(ip, &del);
676 674
677 /* Free the CoW orphan record. */ 675 /* Free the CoW orphan record. */
678 error = xfs_refcount_free_cow_extent(tp, del.br_startblock, 676 xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
679 del.br_blockcount);
680 if (error)
681 goto out_cancel;
682 677
683 /* Map the new blocks into the data fork. */ 678 /* Map the new blocks into the data fork. */
684 error = xfs_bmap_map_extent(tp, ip, &del); 679 xfs_bmap_map_extent(tp, ip, &del);
685 if (error)
686 goto out_cancel;
687 680
688 /* Charge this new data fork mapping to the on-disk quota. */ 681 /* Charge this new data fork mapping to the on-disk quota. */
689 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, 682 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1070,14 +1063,10 @@ xfs_reflink_remap_extent(
1070 uirec.br_blockcount, uirec.br_startblock); 1063 uirec.br_blockcount, uirec.br_startblock);
1071 1064
1072 /* Update the refcount tree */ 1065 /* Update the refcount tree */
1073 error = xfs_refcount_increase_extent(tp, &uirec); 1066 xfs_refcount_increase_extent(tp, &uirec);
1074 if (error)
1075 goto out_cancel;
1076 1067
1077 /* Map the new blocks into the data fork. */ 1068 /* Map the new blocks into the data fork. */
1078 error = xfs_bmap_map_extent(tp, ip, &uirec); 1069 xfs_bmap_map_extent(tp, ip, &uirec);
1079 if (error)
1080 goto out_cancel;
1081 1070
1082 /* Update quota accounting. */ 1071 /* Update quota accounting. */
1083 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1072 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 77ed557b6127..8939e0ea09cd 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -142,9 +142,9 @@ xfs_rui_init(
142 142
143 ASSERT(nextents > 0); 143 ASSERT(nextents > 0);
144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); 145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
146 else 146 else
147 ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); 147 ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
148 148
149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
150 ruip->rui_format.rui_nextents = nextents; 150 ruip->rui_format.rui_nextents = nextents;
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
244{ 244{
245 struct xfs_rud_log_item *rudp; 245 struct xfs_rud_log_item *rudp;
246 246
247 rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); 247 rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
249 &xfs_rud_item_ops); 249 &xfs_rud_item_ops);
250 rudp->rud_ruip = ruip; 250 rudp->rud_ruip = ruip;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5fa4db3c3e32..4a48a8c75b4f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -865,7 +865,7 @@ xfs_alloc_rsum_cache(
865 * lower bound on the minimum level with any free extents. We can 865 * lower bound on the minimum level with any free extents. We can
866 * continue without the cache if it couldn't be allocated. 866 * continue without the cache if it couldn't be allocated.
867 */ 867 */
868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); 868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
869 if (!mp->m_rsum_cache) 869 if (!mp->m_rsum_cache)
870 xfs_warn(mp, "could not allocate realtime summary cache"); 870 xfs_warn(mp, "could not allocate realtime summary cache");
871} 871}
@@ -963,7 +963,7 @@ xfs_growfs_rt(
963 /* 963 /*
964 * Allocate a new (fake) mount/sb. 964 * Allocate a new (fake) mount/sb.
965 */ 965 */
966 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); 966 nmp = kmem_alloc(sizeof(*nmp), 0);
967 /* 967 /*
968 * Loop over the bitmap blocks. 968 * Loop over the bitmap blocks.
969 * We will do everything one bitmap block at a time. 969 * We will do everything one bitmap block at a time.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f9450235533c..391b4748cae3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -818,7 +818,8 @@ xfs_init_mount_workqueues(
818 goto out_destroy_buf; 818 goto out_destroy_buf;
819 819
820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
821 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 821 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
822 0, mp->m_fsname);
822 if (!mp->m_cil_workqueue) 823 if (!mp->m_cil_workqueue)
823 goto out_destroy_unwritten; 824 goto out_destroy_unwritten;
824 825
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8094b1920eef..eaae275ed430 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -23,6 +23,7 @@ struct xlog;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xlog_recover; 24struct xlog_recover;
25struct xlog_recover_item; 25struct xlog_recover_item;
26struct xlog_rec_header;
26struct xfs_buf_log_format; 27struct xfs_buf_log_format;
27struct xfs_inode_log_format; 28struct xfs_inode_log_format;
28struct xfs_bmbt_irec; 29struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@ struct xfs_btree_cur;
30struct xfs_refcount_irec; 31struct xfs_refcount_irec;
31struct xfs_fsmap; 32struct xfs_fsmap;
32struct xfs_rmap_irec; 33struct xfs_rmap_irec;
34struct xfs_icreate_log;
35struct xfs_owner_info;
36struct xfs_trans_res;
37struct xfs_inobt_rec_incore;
33 38
34DECLARE_EVENT_CLASS(xfs_attr_list_class, 39DECLARE_EVENT_CLASS(xfs_attr_list_class,
35 TP_PROTO(struct xfs_attr_list_context *ctx), 40 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3575,6 +3580,35 @@ TRACE_EVENT(xfs_pwork_init,
3575 __entry->nr_threads, __entry->pid) 3580 __entry->nr_threads, __entry->pid)
3576) 3581)
3577 3582
3583DECLARE_EVENT_CLASS(xfs_kmem_class,
3584 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
3585 TP_ARGS(size, flags, caller_ip),
3586 TP_STRUCT__entry(
3587 __field(ssize_t, size)
3588 __field(int, flags)
3589 __field(unsigned long, caller_ip)
3590 ),
3591 TP_fast_assign(
3592 __entry->size = size;
3593 __entry->flags = flags;
3594 __entry->caller_ip = caller_ip;
3595 ),
3596 TP_printk("size %zd flags 0x%x caller %pS",
3597 __entry->size,
3598 __entry->flags,
3599 (char *)__entry->caller_ip)
3600)
3601
3602#define DEFINE_KMEM_EVENT(name) \
3603DEFINE_EVENT(xfs_kmem_class, name, \
3604 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
3605 TP_ARGS(size, flags, caller_ip))
3606DEFINE_KMEM_EVENT(kmem_alloc);
3607DEFINE_KMEM_EVENT(kmem_alloc_io);
3608DEFINE_KMEM_EVENT(kmem_alloc_large);
3609DEFINE_KMEM_EVENT(kmem_realloc);
3610DEFINE_KMEM_EVENT(kmem_zone_alloc);
3611
3578#endif /* _TRACE_XFS_H */ 3612#endif /* _TRACE_XFS_H */
3579 3613
3580#undef TRACE_INCLUDE_PATH 3614#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d42a68d8313b..f4795fdb7389 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -90,7 +90,7 @@ xfs_trans_dup(
90 90
91 trace_xfs_trans_dup(tp, _RET_IP_); 91 trace_xfs_trans_dup(tp, _RET_IP_);
92 92
93 ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 93 ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
94 94
95 /* 95 /*
96 * Initialize the new transaction structure. 96 * Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
263 * GFP_NOFS allocation context so that we avoid lockdep false positives 263 * GFP_NOFS allocation context so that we avoid lockdep false positives
264 * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 264 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
265 */ 265 */
266 tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 266 tp = kmem_zone_zalloc(xfs_trans_zone, 0);
267 if (!(flags & XFS_TRANS_NO_WRITECOUNT)) 267 if (!(flags & XFS_TRANS_NO_WRITECOUNT))
268 sb_start_intwrite(mp->m_super); 268 sb_start_intwrite(mp->m_super);
269 269
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 1027c9ca6eb8..16457465833b 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -863,7 +863,7 @@ STATIC void
863xfs_trans_alloc_dqinfo( 863xfs_trans_alloc_dqinfo(
864 xfs_trans_t *tp) 864 xfs_trans_t *tp)
865{ 865{
866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); 866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
867} 867}
868 868
869void 869void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 3123b5aaad2a..cb895b1df5e4 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -30,7 +30,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
30 value = NULL; 30 value = NULL;
31 } 31 }
32 32
33 error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 33 error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
34 if (error) 34 if (error)
35 return error; 35 return error;
36 return asize; 36 return asize;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae6648145d18..ffe35d97afcb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3543,6 +3543,8 @@ extern void inode_nohighmem(struct inode *inode);
3543/* mm/fadvise.c */ 3543/* mm/fadvise.c */
3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
3545 int advice); 3545 int advice);
3546extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
3547 int advice);
3546 3548
3547#if defined(CONFIG_IO_URING) 3549#if defined(CONFIG_IO_URING)
3548extern struct sock *io_uring_get_socket(struct file *file); 3550extern struct sock *io_uring_get_socket(struct file *file);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd032037..4f17c83db575 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -27,8 +27,7 @@
27 * deactivate the pages and clear PG_Referenced. 27 * deactivate the pages and clear PG_Referenced.
28 */ 28 */
29 29
30static int generic_fadvise(struct file *file, loff_t offset, loff_t len, 30int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
31 int advice)
32{ 31{
33 struct inode *inode; 32 struct inode *inode;
34 struct address_space *mapping; 33 struct address_space *mapping;
@@ -178,6 +177,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
178 } 177 }
179 return 0; 178 return 0;
180} 179}
180EXPORT_SYMBOL(generic_fadvise);
181 181
182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
183{ 183{
diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3aa069f..bac973b9f2cc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -14,6 +14,7 @@
14#include <linux/userfaultfd_k.h> 14#include <linux/userfaultfd_k.h>
15#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/falloc.h> 16#include <linux/falloc.h>
17#include <linux/fadvise.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/ksm.h> 19#include <linux/ksm.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
@@ -275,6 +276,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
275 unsigned long start, unsigned long end) 276 unsigned long start, unsigned long end)
276{ 277{
277 struct file *file = vma->vm_file; 278 struct file *file = vma->vm_file;
279 loff_t offset;
278 280
279 *prev = vma; 281 *prev = vma;
280#ifdef CONFIG_SWAP 282#ifdef CONFIG_SWAP
@@ -298,12 +300,20 @@ static long madvise_willneed(struct vm_area_struct *vma,
298 return 0; 300 return 0;
299 } 301 }
300 302
301 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 303 /*
302 if (end > vma->vm_end) 304 * Filesystem's fadvise may need to take various locks. We need to
303 end = vma->vm_end; 305 * explicitly grab a reference because the vma (and hence the
304 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 306 * vma's reference to the file) can go away as soon as we drop
305 307 * mmap_sem.
306 force_page_cache_readahead(file->f_mapping, file, start, end - start); 308 */
309 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
310 get_file(file);
311 up_read(&current->mm->mmap_sem);
312 offset = (loff_t)(start - vma->vm_start)
313 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
314 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
315 fput(file);
316 down_read(&current->mm->mmap_sem);
307 return 0; 317 return 0;
308} 318}
309 319