author	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-28 14:35:40 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-28 14:35:40 -0400
commit	dad4f140edaa3f6bb452b6913d41af1ffd672e45 (patch)
tree	1c0ebdcdfcdfb4ec9af7810c5ad9bae0f791ff5c
parent	69d5b97c597307773fe6c59775a5d5a88bb7e6b3 (diff)
parent	3a08cd52c37c793ffc199f6fc2ecfc368e284b2d (diff)
Merge branch 'xarray' of git://git.infradead.org/users/willy/linux-dax
Pull XArray conversion from Matthew Wilcox:
 "The XArray provides an improved interface to the radix tree data
  structure, providing locking as part of the API, specifying GFP flags
  at allocation time, eliminating preloading, less re-walking the tree,
  more efficient iterations and not exposing RCU-protected pointers to
  its users.

  This patch set

   1. Introduces the XArray implementation

   2. Converts the pagecache to use it

   3. Converts memremap to use it

  The page cache is the most complex and important user of the radix
  tree, so converting it was most important. Converting the memremap
  code removes the only other user of the multiorder code, which allows
  us to remove the radix tree code that supported it.

  I have 40+ followup patches to convert many other users of the radix
  tree over to the XArray, but I'd like to get this part in first. The
  other conversions haven't been in linux-next and aren't suitable for
  applying yet, but you can see them in the xarray-conv branch if
  you're interested"

* 'xarray' of git://git.infradead.org/users/willy/linux-dax: (90 commits)
  radix tree: Remove multiorder support
  radix tree test: Convert multiorder tests to XArray
  radix tree tests: Convert item_delete_rcu to XArray
  radix tree tests: Convert item_kill_tree to XArray
  radix tree tests: Move item_insert_order
  radix tree test suite: Remove multiorder benchmarking
  radix tree test suite: Remove __item_insert
  memremap: Convert to XArray
  xarray: Add range store functionality
  xarray: Move multiorder_check to in-kernel tests
  xarray: Move multiorder_shrink to kernel tests
  xarray: Move multiorder account test in-kernel
  radix tree test suite: Convert iteration test to XArray
  radix tree test suite: Convert tag_tagged_items to XArray
  radix tree: Remove radix_tree_clear_tags
  radix tree: Remove radix_tree_maybe_preload_order
  radix tree: Remove split/join code
  radix tree: Remove radix_tree_update_node_t
  page cache: Finish XArray conversion
  dax: Convert page fault handlers to XArray
  ...
-rw-r--r--  .clang-format | 1
-rw-r--r--  .mailmap | 7
-rw-r--r--  Documentation/core-api/index.rst | 1
-rw-r--r--  Documentation/core-api/xarray.rst | 435
-rw-r--r--  MAINTAINERS | 17
-rw-r--r--  arch/parisc/kernel/syscall.S | 2
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h | 4
-rw-r--r--  arch/powerpc/include/asm/nohash/64/pgtable.h | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_gem.c | 17
-rw-r--r--  drivers/input/keyboard/hilkbd.c | 2
-rw-r--r--  drivers/pci/hotplug/acpiphp.h | 2
-rw-r--r--  drivers/pci/hotplug/acpiphp_core.c | 4
-rw-r--r--  drivers/pci/hotplug/acpiphp_glue.c | 2
-rw-r--r--  drivers/staging/erofs/utils.c | 18
-rw-r--r--  fs/btrfs/compression.c | 6
-rw-r--r--  fs/btrfs/extent_io.c | 12
-rw-r--r--  fs/buffer.c | 14
-rw-r--r--  fs/dax.c | 917
-rw-r--r--  fs/ext4/inode.c | 2
-rw-r--r--  fs/f2fs/data.c | 6
-rw-r--r--  fs/f2fs/dir.c | 2
-rw-r--r--  fs/f2fs/f2fs.h | 2
-rw-r--r--  fs/f2fs/inline.c | 2
-rw-r--r--  fs/f2fs/node.c | 6
-rw-r--r--  fs/fs-writeback.c | 25
-rw-r--r--  fs/gfs2/aops.c | 2
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/isofs/dir.c | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nilfs2/btnode.c | 26
-rw-r--r--  fs/nilfs2/page.c | 29
-rw-r--r--  fs/proc/task_mmu.c | 2
-rw-r--r--  include/linux/fs.h | 63
-rw-r--r--  include/linux/idr.h | 18
-rw-r--r--  include/linux/pagemap.h | 10
-rw-r--r--  include/linux/pagevec.h | 8
-rw-r--r--  include/linux/radix-tree.h | 178
-rw-r--r--  include/linux/swap.h | 22
-rw-r--r--  include/linux/swapops.h | 19
-rw-r--r--  include/linux/xarray.h | 1293
-rw-r--r--  kernel/memremap.c | 75
-rw-r--r--  lib/Kconfig | 5
-rw-r--r--  lib/Kconfig.debug | 3
-rw-r--r--  lib/Makefile | 3
-rw-r--r--  lib/idr.c | 401
-rw-r--r--  lib/radix-tree.c | 834
-rw-r--r--  lib/test_xarray.c | 1238
-rw-r--r--  lib/xarray.c | 2036
-rw-r--r--  mm/Kconfig | 4
-rw-r--r--  mm/filemap.c | 724
-rw-r--r--  mm/huge_memory.c | 17
-rw-r--r--  mm/khugepaged.c | 178
-rw-r--r--  mm/madvise.c | 2
-rw-r--r--  mm/memcontrol.c | 2
-rw-r--r--  mm/memfd.c | 105
-rw-r--r--  mm/migrate.c | 48
-rw-r--r--  mm/mincore.c | 2
-rw-r--r--  mm/page-writeback.c | 72
-rw-r--r--  mm/readahead.c | 10
-rw-r--r--  mm/shmem.c | 193
-rw-r--r--  mm/swap.c | 6
-rw-r--r--  mm/swap_state.c | 119
-rw-r--r--  mm/truncate.c | 27
-rw-r--r--  mm/vmscan.c | 10
-rw-r--r--  mm/workingset.c | 68
-rw-r--r--  tools/include/asm-generic/bitops.h | 1
-rw-r--r--  tools/include/asm-generic/bitops/atomic.h | 9
-rw-r--r--  tools/include/asm-generic/bitops/non-atomic.h | 109
-rw-r--r--  tools/include/linux/bitmap.h | 1
-rw-r--r--  tools/include/linux/kernel.h | 1
-rw-r--r--  tools/include/linux/spinlock.h | 12
-rw-r--r--  tools/testing/radix-tree/.gitignore | 1
-rw-r--r--  tools/testing/radix-tree/Makefile | 11
-rw-r--r--  tools/testing/radix-tree/benchmark.c | 141
-rw-r--r--  tools/testing/radix-tree/bitmap.c | 23
-rw-r--r--  tools/testing/radix-tree/generated/autoconf.h | 2
-rw-r--r--  tools/testing/radix-tree/idr-test.c | 71
-rw-r--r--  tools/testing/radix-tree/iteration_check.c | 109
-rw-r--r--  tools/testing/radix-tree/linux/bug.h | 1
-rw-r--r--  tools/testing/radix-tree/linux/kconfig.h | 1
-rw-r--r--  tools/testing/radix-tree/linux/kernel.h | 5
-rw-r--r--  tools/testing/radix-tree/linux/lockdep.h | 11
-rw-r--r--  tools/testing/radix-tree/linux/radix-tree.h | 1
-rw-r--r--  tools/testing/radix-tree/linux/rcupdate.h | 2
-rw-r--r--  tools/testing/radix-tree/main.c | 66
-rw-r--r--  tools/testing/radix-tree/multiorder.c | 609
-rw-r--r--  tools/testing/radix-tree/regression1.c | 75
-rw-r--r--  tools/testing/radix-tree/regression2.c | 8
-rw-r--r--  tools/testing/radix-tree/regression3.c | 23
-rw-r--r--  tools/testing/radix-tree/tag_check.c | 33
-rw-r--r--  tools/testing/radix-tree/test.c | 131
-rw-r--r--  tools/testing/radix-tree/test.h | 13
-rw-r--r--  tools/testing/radix-tree/xarray.c | 35
93 files changed, 7052 insertions, 3821 deletions
diff --git a/.clang-format b/.clang-format
index 1d5da22e0ba5..e6080f5834a3 100644
--- a/.clang-format
+++ b/.clang-format
@@ -323,7 +323,6 @@ ForEachMacros:
323 - 'protocol_for_each_card' 323 - 'protocol_for_each_card'
324 - 'protocol_for_each_dev' 324 - 'protocol_for_each_dev'
325 - 'queue_for_each_hw_ctx' 325 - 'queue_for_each_hw_ctx'
326 - 'radix_tree_for_each_contig'
327 - 'radix_tree_for_each_slot' 326 - 'radix_tree_for_each_slot'
328 - 'radix_tree_for_each_tagged' 327 - 'radix_tree_for_each_tagged'
329 - 'rbtree_postorder_for_each_entry_safe' 328 - 'rbtree_postorder_for_each_entry_safe'
diff --git a/.mailmap b/.mailmap
index 285e09645b31..89f532caf639 100644
--- a/.mailmap
+++ b/.mailmap
@@ -119,6 +119,13 @@ Mark Brown <broonie@sirena.org.uk>
119Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com> 119Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
120Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com> 120Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
121Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com> 121Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
122Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
123Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
124Matthew Wilcox <willy@infradead.org> <mawilcox@linuxonhyperv.com>
125Matthew Wilcox <willy@infradead.org> <mawilcox@microsoft.com>
126Matthew Wilcox <willy@infradead.org> <willy@debian.org>
127Matthew Wilcox <willy@infradead.org> <willy@linux.intel.com>
128Matthew Wilcox <willy@infradead.org> <willy@parisc-linux.org>
122Matthieu CASTET <castet.matthieu@free.fr> 129Matthieu CASTET <castet.matthieu@free.fr>
123Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br> 130Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
124Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com> 131Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 29c790f571a5..3adee82be311 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -21,6 +21,7 @@ Core utilities
21 local_ops 21 local_ops
22 workqueue 22 workqueue
23 genericirq 23 genericirq
24 xarray
24 flexible-arrays 25 flexible-arrays
25 librs 26 librs
26 genalloc 27 genalloc
diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
new file mode 100644
index 000000000000..a4e705108f42
--- /dev/null
+++ b/Documentation/core-api/xarray.rst
@@ -0,0 +1,435 @@
1.. SPDX-License-Identifier: GPL-2.0+
2
3======
4XArray
5======
6
7:Author: Matthew Wilcox
8
9Overview
10========
11
12The XArray is an abstract data type which behaves like a very large array
13of pointers. It meets many of the same needs as a hash or a conventional
14resizable array. Unlike a hash, it allows you to sensibly go to the
15next or previous entry in a cache-efficient manner. In contrast to a
16resizable array, there is no need to copy data or change MMU mappings in
17order to grow the array. It is more memory-efficient, parallelisable
18and cache friendly than a doubly-linked list. It takes advantage of
19RCU to perform lookups without locking.
20
21The XArray implementation is efficient when the indices used are densely
22clustered; hashing the object and using the hash as the index will not
23perform well. The XArray is optimised for small indices, but still has
24good performance with large indices. If your index can be larger than
25``ULONG_MAX`` then the XArray is not the data type for you. The most
26important user of the XArray is the page cache.
27
28Each non-``NULL`` entry in the array has three bits associated with
29it called marks. Each mark may be set or cleared independently of
30the others. You can iterate over entries which are marked.
31
32Normal pointers may be stored in the XArray directly. They must be 4-byte
33aligned, which is true for any pointer returned from :c:func:`kmalloc` and
34:c:func:`alloc_page`. It isn't true for arbitrary user-space pointers,
35nor for function pointers. You can store pointers to statically allocated
36objects, as long as those objects have an alignment of at least 4.
37
38You can also store integers between 0 and ``LONG_MAX`` in the XArray.
39You must first convert it into an entry using :c:func:`xa_mk_value`.
40When you retrieve an entry from the XArray, you can check whether it is
41a value entry by calling :c:func:`xa_is_value`, and convert it back to
42an integer by calling :c:func:`xa_to_value`.
43
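As a minimal sketch of these value helpers (the ``error_codes`` array and the stored number are invented purely for illustration)::

    #include <linux/xarray.h>

    static DEFINE_XARRAY(error_codes);

    /* Store the integer 42 at index 5 as a value entry. */
    static int record_answer(void)
    {
            void *old = xa_store(&error_codes, 5, xa_mk_value(42), GFP_KERNEL);

            return xa_err(old);     /* 0 unless the store failed, e.g. -ENOMEM */
    }

    /* Read it back, checking that the entry really is a value entry. */
    static unsigned long read_answer(void)
    {
            void *entry = xa_load(&error_codes, 5);

            return xa_is_value(entry) ? xa_to_value(entry) : 0;
    }
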
44Some users want to store tagged pointers instead of using the marks
45described above. They can call :c:func:`xa_tag_pointer` to create an
46entry with a tag, :c:func:`xa_untag_pointer` to turn a tagged entry
47back into an untagged pointer and :c:func:`xa_pointer_tag` to retrieve
48the tag of an entry. Tagged pointers use the same bits that are used
49to distinguish value entries from normal pointers, so each user must
50decide whether they want to store value entries or tagged pointers in
51any particular XArray.
52
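A brief sketch of the tagged-pointer helpers; ``struct widget`` and the ``widgets`` array are placeholders rather than an in-tree user::

    #include <linux/xarray.h>

    struct widget;                  /* stand-in for the caller's own type */
    static DEFINE_XARRAY(widgets);

    /* Store @w at @index with tag 1; tags may be 0-3. */
    static int widget_store(unsigned long index, struct widget *w)
    {
            return xa_err(xa_store(&widgets, index, xa_tag_pointer(w, 1),
                                   GFP_KERNEL));
    }

    /* Retrieve the pointer and report its tag through @tag. */
    static struct widget *widget_load(unsigned long index, unsigned int *tag)
    {
            void *entry = xa_load(&widgets, index);

            *tag = xa_pointer_tag(entry);
            return xa_untag_pointer(entry);
    }
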
53The XArray does not support storing :c:func:`IS_ERR` pointers as some
54conflict with value entries or internal entries.
55
56An unusual feature of the XArray is the ability to create entries which
57occupy a range of indices. Once stored to, looking up any index in
58the range will return the same entry as looking up any other index in
59the range. Setting a mark on one index will set it on all of them.
60Storing to any index will store to all of them. Multi-index entries can
61be explicitly split into smaller entries, or storing ``NULL`` into any
62entry will cause the XArray to forget about the range.
63
64Normal API
65==========
66
67Start by initialising an XArray, either with :c:func:`DEFINE_XARRAY`
68for statically allocated XArrays or :c:func:`xa_init` for dynamically
69allocated ones. A freshly-initialised XArray contains a ``NULL``
70pointer at every index.
71
72You can then set entries using :c:func:`xa_store` and get entries
73using :c:func:`xa_load`. xa_store will overwrite any entry with the
74new entry and return the previous entry stored at that index. You can
75use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a
76``NULL`` entry. There is no difference between an entry that has never
77been stored to and one that has most recently had ``NULL`` stored to it.
78
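For example, a hypothetical ``sessions`` array could be used like this (a sketch, not an in-tree API)::

    #include <linux/xarray.h>

    struct session;
    static DEFINE_XARRAY(sessions);

    static int session_add(unsigned long id, struct session *s)
    {
            /* Returns the previous entry, or an error value on failure. */
            void *old = xa_store(&sessions, id, s, GFP_KERNEL);

            return xa_err(old);
    }

    static struct session *session_find(unsigned long id)
    {
            return xa_load(&sessions, id);  /* NULL if nothing is stored */
    }

    static void session_del(unsigned long id)
    {
            xa_erase(&sessions, id);        /* same effect as storing NULL */
    }
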
79You can conditionally replace an entry at an index by using
80:c:func:`xa_cmpxchg`. Like :c:func:`cmpxchg`, it will only succeed if
81the entry at that index has the 'old' value. It also returns the entry
82which was at that index; if it returns the same entry which was passed as
83'old', then :c:func:`xa_cmpxchg` succeeded.
84
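A sketch of that compare-and-exchange pattern; the ``-EBUSY`` return convention is this example's own choice::

    #include <linux/xarray.h>

    /* Replace @old with @new at @index only if @old is still stored there. */
    static int replace_entry(struct xarray *xa, unsigned long index,
                             void *old, void *new)
    {
            void *curr = xa_cmpxchg(xa, index, old, new, GFP_KERNEL);

            if (xa_is_err(curr))
                    return xa_err(curr);            /* e.g. -ENOMEM */
            return curr == old ? 0 : -EBUSY;        /* lost the race */
    }
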
85If you want to only store a new entry to an index if the current entry
86at that index is ``NULL``, you can use :c:func:`xa_insert` which
87returns ``-EEXIST`` if the entry is not empty.
88
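For instance (``claim_index`` is an invented helper)::

    #include <linux/xarray.h>

    /* Install @item at @index only if that index is currently empty. */
    static int claim_index(struct xarray *xa, unsigned long index, void *item)
    {
            /* Returns -EEXIST if somebody already stored something here. */
            return xa_insert(xa, index, item, GFP_KERNEL);
    }
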
89You can enquire whether a mark is set on an entry by using
90:c:func:`xa_get_mark`. If the entry is not ``NULL``, you can set a mark
91on it by using :c:func:`xa_set_mark` and remove the mark from an entry by
92calling :c:func:`xa_clear_mark`. You can ask whether any entry in the
93XArray has a particular mark set by calling :c:func:`xa_marked`.
94
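A short sketch of the mark helpers, arbitrarily using ``XA_MARK_0`` to mean 'dirty'; the array and that meaning are invented here::

    #include <linux/xarray.h>

    static DEFINE_XARRAY(tracked);

    static void note_dirty(unsigned long index)
    {
            /* Only meaningful if a non-NULL entry is stored at @index. */
            xa_set_mark(&tracked, index, XA_MARK_0);
    }

    static bool is_dirty(unsigned long index)
    {
            return xa_get_mark(&tracked, index, XA_MARK_0);
    }

    static bool any_dirty(void)
    {
            return xa_marked(&tracked, XA_MARK_0);
    }

    static void clear_dirty(unsigned long index)
    {
            xa_clear_mark(&tracked, index, XA_MARK_0);
    }
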
95You can copy entries out of the XArray into a plain array by calling
96:c:func:`xa_extract`. Or you can iterate over the present entries in
97the XArray by calling :c:func:`xa_for_each`. You may prefer to use
98:c:func:`xa_find` or :c:func:`xa_find_after` to move to the next present
99entry in the XArray.
100
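For instance, every present entry can be visited with :c:func:`xa_find` and :c:func:`xa_find_after`; the loop is spelled out here because the argument list of the :c:func:`xa_for_each` macro has varied between kernel versions::

    #include <linux/xarray.h>

    /* Count every present entry in @xa. */
    static unsigned long count_entries(struct xarray *xa)
    {
            unsigned long index = 0, count = 0;
            void *entry;

            for (entry = xa_find(xa, &index, ULONG_MAX, XA_PRESENT); entry;
                 entry = xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT))
                    count++;
            return count;
    }
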
101Calling :c:func:`xa_store_range` stores the same entry in a range
102of indices. If you do this, some of the other operations will behave
103in a slightly odd way. For example, marking the entry at one index
104may result in the entry being marked at some, but not all of the other
105indices. Storing into one index may result in the entry retrieved by
106some, but not all of the other indices changing.
107
108Finally, you can remove all entries from an XArray by calling
109:c:func:`xa_destroy`. If the XArray entries are pointers, you may wish
110to free the entries first. You can do this by iterating over all present
111entries in the XArray using the :c:func:`xa_for_each` iterator.
112
113ID assignment
114-------------
115
116You can call :c:func:`xa_alloc` to store the entry at any unused index
117in the XArray. If you need to modify the array from interrupt context,
118you can use :c:func:`xa_alloc_bh` or :c:func:`xa_alloc_irq` to disable
119interrupts while allocating the ID. Unlike :c:func:`xa_store`, allocating
120a ``NULL`` pointer does not delete an entry. Instead it reserves an
121entry like :c:func:`xa_reserve` and you can release it using either
122:c:func:`xa_erase` or :c:func:`xa_release`. To use ID assignment, the
123XArray must be defined with :c:func:`DEFINE_XARRAY_ALLOC`, or initialised
124by passing ``XA_FLAGS_ALLOC`` to :c:func:`xa_init_flags`.
125
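A sketch of ID assignment. The calling convention of :c:func:`xa_alloc` has changed in later kernels; this assumes the form introduced with this series, which takes the maximum ID directly::

    #include <linux/xarray.h>

    static DEFINE_XARRAY_ALLOC(clients);

    /* Find an unused index, store @client there and report it through @id. */
    static int client_register(void *client, u32 *id)
    {
            return xa_alloc(&clients, id, UINT_MAX, client, GFP_KERNEL);
    }
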
126Memory allocation
127-----------------
128
129The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_alloc`,
130:c:func:`xa_reserve` and :c:func:`xa_insert` functions take a gfp_t
131parameter in case the XArray needs to allocate memory to store this entry.
132If the entry is being deleted, no memory allocation needs to be performed,
133and the GFP flags specified will be ignored.
134
135It is possible for no memory to be allocatable, particularly if you pass
136a restrictive set of GFP flags. In that case, the functions return a
137special value which can be turned into an errno using :c:func:`xa_err`.
138If you don't need to know exactly which error occurred, using
139:c:func:`xa_is_err` is slightly more efficient.
140
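For example, turning a failed store into an errno (a sketch)::

    #include <linux/xarray.h>

    static int store_with_errno(struct xarray *xa, unsigned long index,
                                void *item)
    {
            void *curr = xa_store(xa, index, item, GFP_NOWAIT);

            if (xa_is_err(curr))
                    return xa_err(curr);    /* typically -ENOMEM here */
            return 0;
    }
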
141Locking
142-------
143
144When using the Normal API, you do not have to worry about locking.
145The XArray uses RCU and an internal spinlock to synchronise access:
146
147No lock needed:
148 * :c:func:`xa_empty`
149 * :c:func:`xa_marked`
150
151Takes RCU read lock:
152 * :c:func:`xa_load`
153 * :c:func:`xa_for_each`
154 * :c:func:`xa_find`
155 * :c:func:`xa_find_after`
156 * :c:func:`xa_extract`
157 * :c:func:`xa_get_mark`
158
159Takes xa_lock internally:
160 * :c:func:`xa_store`
161 * :c:func:`xa_insert`
162 * :c:func:`xa_erase`
163 * :c:func:`xa_erase_bh`
164 * :c:func:`xa_erase_irq`
165 * :c:func:`xa_cmpxchg`
166 * :c:func:`xa_store_range`
167 * :c:func:`xa_alloc`
168 * :c:func:`xa_alloc_bh`
169 * :c:func:`xa_alloc_irq`
170 * :c:func:`xa_destroy`
171 * :c:func:`xa_set_mark`
172 * :c:func:`xa_clear_mark`
173
174Assumes xa_lock held on entry:
175 * :c:func:`__xa_store`
176 * :c:func:`__xa_insert`
177 * :c:func:`__xa_erase`
178 * :c:func:`__xa_cmpxchg`
179 * :c:func:`__xa_alloc`
180 * :c:func:`__xa_set_mark`
181 * :c:func:`__xa_clear_mark`
182
183If you want to take advantage of the lock to protect the data structures
184that you are storing in the XArray, you can call :c:func:`xa_lock`
185before calling :c:func:`xa_load`, then take a reference count on the
186object you have found before calling :c:func:`xa_unlock`. This will
187prevent stores from removing the object from the array between looking
188up the object and incrementing the refcount. You can also use RCU to
189avoid dereferencing freed memory, but an explanation of that is beyond
190the scope of this document.
191
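A sketch of that lookup-and-hold pattern, assuming the stored objects embed a ``struct kref``::

    #include <linux/kref.h>
    #include <linux/xarray.h>

    struct obj {
            struct kref ref;
            /* ... */
    };

    static struct obj *obj_get(struct xarray *xa, unsigned long index)
    {
            struct obj *p;

            xa_lock(xa);
            p = xa_load(xa, index);
            if (p)
                    kref_get(&p->ref);  /* can't be erased while we hold xa_lock */
            xa_unlock(xa);
            return p;
    }
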
192The XArray does not disable interrupts or softirqs while modifying
193the array. It is safe to read the XArray from interrupt or softirq
194context as the RCU lock provides enough protection.
195
196If, for example, you want to store entries in the XArray in process
197context and then erase them in softirq context, you can do that this way::
198
199 void foo_init(struct foo *foo)
200 {
201 xa_init_flags(&foo->array, XA_FLAGS_LOCK_BH);
202 }
203
204 int foo_store(struct foo *foo, unsigned long index, void *entry)
205 {
206 int err;
207
208 xa_lock_bh(&foo->array);
209 err = xa_err(__xa_store(&foo->array, index, entry, GFP_KERNEL));
210 if (!err)
211 foo->count++;
212 xa_unlock_bh(&foo->array);
213 return err;
214 }
215
216 /* foo_erase() is only called from softirq context */
217 void foo_erase(struct foo *foo, unsigned long index)
218 {
219 xa_lock(&foo->array);
220 __xa_erase(&foo->array, index);
221 foo->count--;
222 xa_unlock(&foo->array);
223 }
224
225If you are going to modify the XArray from interrupt or softirq context,
226you need to initialise the array using :c:func:`xa_init_flags`, passing
227``XA_FLAGS_LOCK_IRQ`` or ``XA_FLAGS_LOCK_BH``.
228
229The above example also shows a common pattern of wanting to extend the
230coverage of the xa_lock on the store side to protect some statistics
231associated with the array.
232
233Sharing the XArray with interrupt context is also possible, either
234using :c:func:`xa_lock_irqsave` in both the interrupt handler and process
235context, or :c:func:`xa_lock_irq` in process context and :c:func:`xa_lock`
236in the interrupt handler. Some of the more common patterns have helper
237functions such as :c:func:`xa_erase_bh` and :c:func:`xa_erase_irq`.
238
239Sometimes you need to protect access to the XArray with a mutex because
240that lock sits above another mutex in the locking hierarchy. That does
241not entitle you to use functions like :c:func:`__xa_erase` without taking
242the xa_lock; the xa_lock is used for lockdep validation and will be used
243for other purposes in the future.
244
245The :c:func:`__xa_set_mark` and :c:func:`__xa_clear_mark` functions are also
246available for situations where you look up an entry and want to atomically
247set or clear a mark. It may be more efficient to use the advanced API
248in this case, as it will save you from walking the tree twice.
249
250Advanced API
251============
252
253The advanced API offers more flexibility and better performance at the
254cost of an interface which can be harder to use and has fewer safeguards.
255No locking is done for you by the advanced API, and you are required
256to use the xa_lock while modifying the array. You can choose whether
257to use the xa_lock or the RCU lock while doing read-only operations on
258the array. You can mix advanced and normal operations on the same array;
259indeed the normal API is implemented in terms of the advanced API. The
260advanced API is only available to modules with a GPL-compatible license.
261
262The advanced API is based around the xa_state. This is an opaque data
263structure which you declare on the stack using the :c:func:`XA_STATE`
264macro. This macro initialises the xa_state ready to start walking
265around the XArray. It is used as a cursor to maintain the position
266in the XArray and let you compose various operations together without
267having to restart from the top every time.
268
269The xa_state is also used to store errors. You can call
270:c:func:`xas_error` to retrieve the error. All operations check whether
271the xa_state is in an error state before proceeding, so there's no need
272for you to check for an error after each call; you can make multiple
273calls in succession and only check at a convenient point. The only
274errors currently generated by the XArray code itself are ``ENOMEM`` and
275``EINVAL``, but it supports arbitrary errors in case you want to call
276:c:func:`xas_set_err` yourself.
277
278If the xa_state is holding an ``ENOMEM`` error, calling :c:func:`xas_nomem`
279will attempt to allocate more memory using the specified gfp flags and
280cache it in the xa_state for the next attempt. The idea is that you take
281the xa_lock, attempt the operation and drop the lock. The operation
282attempts to allocate memory while holding the lock, but it is more
283likely to fail. Once you have dropped the lock, :c:func:`xas_nomem`
284can try harder to allocate more memory. It will return ``true`` if it
285is worth retrying the operation (i.e. that there was a memory error *and*
286more memory was allocated). If it has previously allocated memory, and
287that memory wasn't used, and there is no error (or some error that isn't
288``ENOMEM``), then it will free the memory previously allocated.
289
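Put together, a store built on the advanced API usually looks roughly like this (a sketch of the pattern just described)::

    #include <linux/xarray.h>

    static int advanced_store(struct xarray *xa, unsigned long index, void *item)
    {
            XA_STATE(xas, xa, index);

            do {
                    xas_lock(&xas);
                    xas_store(&xas, item);
                    xas_unlock(&xas);
            } while (xas_nomem(&xas, GFP_KERNEL));

            return xas_error(&xas);
    }
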
290Internal Entries
291----------------
292
293The XArray reserves some entries for its own purposes. These are never
294exposed through the normal API, but when using the advanced API, it's
295possible to see them. Usually the best way to handle them is to pass them
296to :c:func:`xas_retry`, and retry the operation if it returns ``true``.
297
298.. flat-table::
299 :widths: 1 1 6
300
301 * - Name
302 - Test
303 - Usage
304
305 * - Node
306 - :c:func:`xa_is_node`
307 - An XArray node. May be visible when using a multi-index xa_state.
308
309 * - Sibling
310 - :c:func:`xa_is_sibling`
311 - A non-canonical entry for a multi-index entry. The value indicates
312 which slot in this node has the canonical entry.
313
314 * - Retry
315 - :c:func:`xa_is_retry`
316 - This entry is currently being modified by a thread which has the
317 xa_lock. The node containing this entry may be freed at the end
318 of this RCU period. You should restart the lookup from the head
319 of the array.
320
321 * - Zero
322 - :c:func:`xa_is_zero`
323 - Zero entries appear as ``NULL`` through the Normal API, but occupy
324 an entry in the XArray which can be used to reserve the index for
325 future use.
326
327Other internal entries may be added in the future. As far as possible, they
328will be handled by :c:func:`xas_retry`.
329
330Additional functionality
331------------------------
332
333The :c:func:`xas_create_range` function allocates all the necessary memory
334to store every entry in a range. It will set ENOMEM in the xa_state if
335it cannot allocate memory.
336
337You can use :c:func:`xas_init_marks` to reset the marks on an entry
338to their default state. This is usually all marks clear, unless the
339XArray is marked with ``XA_FLAGS_TRACK_FREE``, in which case mark 0 is set
340and all other marks are clear. Replacing one entry with another using
341:c:func:`xas_store` will not reset the marks on that entry; if you want
342the marks reset, you should do that explicitly.
343
344The :c:func:`xas_load` will walk the xa_state as close to the entry
345as it can. If you know the xa_state has already been walked to the
346entry and need to check that the entry hasn't changed, you can use
347:c:func:`xas_reload` to save a function call.
348
349If you need to move to a different index in the XArray, call
350:c:func:`xas_set`. This resets the cursor to the top of the tree, which
351will generally make the next operation walk the cursor to the desired
352spot in the tree. If you want to move to the next or previous index,
353call :c:func:`xas_next` or :c:func:`xas_prev`. Setting the index does
354not walk the cursor around the array so does not require a lock to be
355held, while moving to the next or previous index does.
356
357You can search for the next present entry using :c:func:`xas_find`. This
358is the equivalent of both :c:func:`xa_find` and :c:func:`xa_find_after`;
359if the cursor has been walked to an entry, then it will find the next
360entry after the one currently referenced. If not, it will return the
361entry at the index of the xa_state. Using :c:func:`xas_next_entry` to
362move to the next present entry instead of :c:func:`xas_find` will save
363a function call in the majority of cases at the expense of emitting more
364inline code.
365
366The :c:func:`xas_find_marked` function is similar. If the xa_state has
367not been walked, it will return the entry at the index of the xa_state,
368if it is marked. Otherwise, it will return the first marked entry after
369the entry referenced by the xa_state. The :c:func:`xas_next_marked`
370function is the equivalent of :c:func:`xas_next_entry`.
371
372When iterating over a range of the XArray using :c:func:`xas_for_each`
373or :c:func:`xas_for_each_marked`, it may be necessary to temporarily stop
374the iteration. The :c:func:`xas_pause` function exists for this purpose.
375After you have done the necessary work and wish to resume, the xa_state
376is in an appropriate state to continue the iteration after the entry
377you last processed. If you have interrupts disabled while iterating,
378then it is good manners to pause the iteration and reenable interrupts
379every ``XA_CHECK_SCHED`` entries.
380
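For example, a long scan that periodically pauses, drops the RCU lock and reschedules; the processing step is left as a comment and ``4096`` is simply an illustrative batch size in the spirit of ``XA_CHECK_SCHED``::

    #include <linux/sched.h>
    #include <linux/xarray.h>

    static void scan_all(struct xarray *xa)
    {
            XA_STATE(xas, xa, 0);
            unsigned int seen = 0;
            void *entry;

            rcu_read_lock();
            xas_for_each(&xas, entry, ULONG_MAX) {
                    if (xas_retry(&xas, entry))
                            continue;
                    /* process(entry); */
                    if (++seen % 4096 == 0) {
                            xas_pause(&xas);
                            rcu_read_unlock();
                            cond_resched();
                            rcu_read_lock();
                    }
            }
            rcu_read_unlock();
    }
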
381The :c:func:`xas_get_mark`, :c:func:`xas_set_mark` and
382:c:func:`xas_clear_mark` functions require the xa_state cursor to have
383been moved to the appropriate location in the xarray; they will do
384nothing if you have called :c:func:`xas_pause` or :c:func:`xas_set`
385immediately before.
386
387You can call :c:func:`xas_set_update` to have a callback function
388called each time the XArray updates a node. This is used by the page
389cache workingset code to maintain its list of nodes which contain only
390shadow entries.
391
392Multi-Index Entries
393-------------------
394
395The XArray has the ability to tie multiple indices together so that
396operations on one index affect all indices. For example, storing into
397any index will change the value of the entry retrieved from any index.
398Setting or clearing a mark on any index will set or clear the mark
399on every index that is tied together. The current implementation
400only allows tying ranges which are aligned powers of two together;
401e.g. indices 64-127 may be tied together, but 2-6 may not be. This may
402save substantial quantities of memory; for example tying 512 entries
403together will save over 4kB.
404
405You can create a multi-index entry by using :c:func:`XA_STATE_ORDER`
406or :c:func:`xas_set_order` followed by a call to :c:func:`xas_store`.
407Calling :c:func:`xas_load` with a multi-index xa_state will walk the
408xa_state to the right location in the tree, but the return value is not
409meaningful, potentially being an internal entry or ``NULL`` even when there
410is an entry stored within the range. Calling :c:func:`xas_find_conflict`
411will return the first entry within the range or ``NULL`` if there are no
412entries in the range. The :c:func:`xas_for_each_conflict` iterator will
413iterate over every entry which overlaps the specified range.
414
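A sketch of creating an order-9 (512-slot) multi-index entry with the advanced API, assuming the kernel is built with multi-index support; the function is invented for illustration::

    #include <linux/xarray.h>

    /* Tie indices [index, index + 511] to @entry; index must be 512-aligned. */
    static int store_order9(struct xarray *xa, unsigned long index, void *entry)
    {
            XA_STATE_ORDER(xas, xa, index, 9);

            do {
                    xas_lock(&xas);
                    xas_store(&xas, entry);
                    xas_unlock(&xas);
            } while (xas_nomem(&xas, GFP_KERNEL));

            return xas_error(&xas);
    }
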
415If :c:func:`xas_load` encounters a multi-index entry, the xa_index
416in the xa_state will not be changed. When iterating over an XArray
417or calling :c:func:`xas_find`, if the initial index is in the middle
418of a multi-index entry, it will not be altered. Subsequent calls
419or iterations will move the index to the first index in the range.
420Each entry will only be returned once, no matter how many indices it
421occupies.
422
423Using :c:func:`xas_next` or :c:func:`xas_prev` with a multi-index xa_state
424is not supported. Using either of these functions on a multi-index entry
425will reveal sibling entries; these should be skipped over by the caller.
426
427Storing ``NULL`` into any index of a multi-index entry will set the entry
428at every index to ``NULL`` and dissolve the tie. Splitting a multi-index
429entry into entries occupying smaller ranges is not yet supported.
430
431Functions and structures
432========================
433
434.. kernel-doc:: include/linux/xarray.h
435.. kernel-doc:: lib/xarray.c
diff --git a/MAINTAINERS b/MAINTAINERS
index bfc9722a4932..a78d45755881 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -535,7 +535,7 @@ F: Documentation/hwmon/adt7475
535F: drivers/hwmon/adt7475.c 535F: drivers/hwmon/adt7475.c
536 536
537ADVANSYS SCSI DRIVER 537ADVANSYS SCSI DRIVER
538M: Matthew Wilcox <matthew@wil.cx> 538M: Matthew Wilcox <willy@infradead.org>
539M: Hannes Reinecke <hare@suse.com> 539M: Hannes Reinecke <hare@suse.com>
540L: linux-scsi@vger.kernel.org 540L: linux-scsi@vger.kernel.org
541S: Maintained 541S: Maintained
@@ -4393,7 +4393,7 @@ S: Maintained
4393F: drivers/i2c/busses/i2c-diolan-u2c.c 4393F: drivers/i2c/busses/i2c-diolan-u2c.c
4394 4394
4395FILESYSTEM DIRECT ACCESS (DAX) 4395FILESYSTEM DIRECT ACCESS (DAX)
4396M: Matthew Wilcox <mawilcox@microsoft.com> 4396M: Matthew Wilcox <willy@infradead.org>
4397M: Ross Zwisler <zwisler@kernel.org> 4397M: Ross Zwisler <zwisler@kernel.org>
4398M: Jan Kara <jack@suse.cz> 4398M: Jan Kara <jack@suse.cz>
4399L: linux-fsdevel@vger.kernel.org 4399L: linux-fsdevel@vger.kernel.org
@@ -8697,7 +8697,7 @@ F: drivers/message/fusion/
8697F: drivers/scsi/mpt3sas/ 8697F: drivers/scsi/mpt3sas/
8698 8698
8699LSILOGIC/SYMBIOS/NCR 53C8XX and 53C1010 PCI-SCSI drivers 8699LSILOGIC/SYMBIOS/NCR 53C8XX and 53C1010 PCI-SCSI drivers
8700M: Matthew Wilcox <matthew@wil.cx> 8700M: Matthew Wilcox <willy@infradead.org>
8701L: linux-scsi@vger.kernel.org 8701L: linux-scsi@vger.kernel.org
8702S: Maintained 8702S: Maintained
8703F: drivers/scsi/sym53c8xx_2/ 8703F: drivers/scsi/sym53c8xx_2/
@@ -16137,6 +16137,17 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso
16137S: Maintained 16137S: Maintained
16138F: arch/x86/entry/vdso/ 16138F: arch/x86/entry/vdso/
16139 16139
16140XARRAY
16141M: Matthew Wilcox <willy@infradead.org>
16142L: linux-fsdevel@vger.kernel.org
16143S: Supported
16144F: Documentation/core-api/xarray.rst
16145F: lib/idr.c
16146F: lib/xarray.c
16147F: include/linux/idr.h
16148F: include/linux/xarray.h
16149F: tools/testing/radix-tree
16150
16140XC2028/3028 TUNER DRIVER 16151XC2028/3028 TUNER DRIVER
16141M: Mauro Carvalho Chehab <mchehab@kernel.org> 16152M: Mauro Carvalho Chehab <mchehab@kernel.org>
16142L: linux-media@vger.kernel.org 16153L: linux-media@vger.kernel.org
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index f5f22ea9b97e..9505c317818d 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -2,7 +2,7 @@
2 * Linux/PA-RISC Project (http://www.parisc-linux.org/) 2 * Linux/PA-RISC Project (http://www.parisc-linux.org/)
3 * 3 *
4 * System call entry code / Linux gateway page 4 * System call entry code / Linux gateway page
5 * Copyright (c) Matthew Wilcox 1999 <willy@bofh.ai> 5 * Copyright (c) Matthew Wilcox 1999 <willy@infradead.org>
6 * Licensed under the GNU GPL. 6 * Licensed under the GNU GPL.
7 * thanks to Philipp Rumpf, Mike Shaver and various others 7 * thanks to Philipp Rumpf, Mike Shaver and various others
8 * sorry about the wall, puffin.. 8 * sorry about the wall, puffin..
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c4a726c10af5..6c99e846a8c9 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -716,9 +716,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
716 BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ 716 BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
717 BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ 717 BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
718 } while (0) 718 } while (0)
719/* 719
720 * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
721 */
722#define SWP_TYPE_BITS 5 720#define SWP_TYPE_BITS 5
723#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ 721#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
724 & ((1UL << SWP_TYPE_BITS) - 1)) 722 & ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 67421f74efcf..e77ed9761632 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -350,9 +350,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
350#define MAX_SWAPFILES_CHECK() do { \ 350#define MAX_SWAPFILES_CHECK() do { \
351 BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ 351 BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
352 } while (0) 352 } while (0)
353/* 353
354 * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
355 */
356#define SWP_TYPE_BITS 5 354#define SWP_TYPE_BITS 5
357#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ 355#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
358 & ((1UL << SWP_TYPE_BITS) - 1)) 356 & ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fcc73a6ab503..316730b45f84 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5996,7 +5996,8 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5996 count = __sg_page_count(sg); 5996 count = __sg_page_count(sg);
5997 5997
5998 while (idx + count <= n) { 5998 while (idx + count <= n) {
5999 unsigned long exception, i; 5999 void *entry;
6000 unsigned long i;
6000 int ret; 6001 int ret;
6001 6002
6002 /* If we cannot allocate and insert this entry, or the 6003 /* If we cannot allocate and insert this entry, or the
@@ -6011,12 +6012,9 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
6011 if (ret && ret != -EEXIST) 6012 if (ret && ret != -EEXIST)
6012 goto scan; 6013 goto scan;
6013 6014
6014 exception = 6015 entry = xa_mk_value(idx);
6015 RADIX_TREE_EXCEPTIONAL_ENTRY |
6016 idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
6017 for (i = 1; i < count; i++) { 6016 for (i = 1; i < count; i++) {
6018 ret = radix_tree_insert(&iter->radix, idx + i, 6017 ret = radix_tree_insert(&iter->radix, idx + i, entry);
6019 (void *)exception);
6020 if (ret && ret != -EEXIST) 6018 if (ret && ret != -EEXIST)
6021 goto scan; 6019 goto scan;
6022 } 6020 }
@@ -6054,15 +6052,14 @@ lookup:
6054 GEM_BUG_ON(!sg); 6052 GEM_BUG_ON(!sg);
6055 6053
6056 /* If this index is in the middle of multi-page sg entry, 6054 /* If this index is in the middle of multi-page sg entry,
6057 * the radixtree will contain an exceptional entry that points 6055 * the radix tree will contain a value entry that points
6058 * to the start of that range. We will return the pointer to 6056 * to the start of that range. We will return the pointer to
6059 * the base page and the offset of this page within the 6057 * the base page and the offset of this page within the
6060 * sg entry's range. 6058 * sg entry's range.
6061 */ 6059 */
6062 *offset = 0; 6060 *offset = 0;
6063 if (unlikely(radix_tree_exception(sg))) { 6061 if (unlikely(xa_is_value(sg))) {
6064 unsigned long base = 6062 unsigned long base = xa_to_value(sg);
6065 (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
6066 6063
6067 sg = radix_tree_lookup(&iter->radix, base); 6064 sg = radix_tree_lookup(&iter->radix, base);
6068 GEM_BUG_ON(!sg); 6065 GEM_BUG_ON(!sg);
diff --git a/drivers/input/keyboard/hilkbd.c b/drivers/input/keyboard/hilkbd.c
index 5c7afdec192c..f5c5ae8b6c06 100644
--- a/drivers/input/keyboard/hilkbd.c
+++ b/drivers/input/keyboard/hilkbd.c
@@ -2,7 +2,7 @@
2 * linux/drivers/hil/hilkbd.c 2 * linux/drivers/hil/hilkbd.c
3 * 3 *
4 * Copyright (C) 1998 Philip Blundell <philb@gnu.org> 4 * Copyright (C) 1998 Philip Blundell <philb@gnu.org>
5 * Copyright (C) 1999 Matthew Wilcox <willy@bofh.ai> 5 * Copyright (C) 1999 Matthew Wilcox <willy@infradead.org>
6 * Copyright (C) 1999-2007 Helge Deller <deller@gmx.de> 6 * Copyright (C) 1999-2007 Helge Deller <deller@gmx.de>
7 * 7 *
8 * Very basic HP Human Interface Loop (HIL) driver. 8 * Very basic HP Human Interface Loop (HIL) driver.
diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h
index cf3058404f41..a2094c07af6a 100644
--- a/drivers/pci/hotplug/acpiphp.h
+++ b/drivers/pci/hotplug/acpiphp.h
@@ -8,7 +8,7 @@
8 * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) 8 * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com)
9 * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) 9 * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com)
10 * Copyright (C) 2002,2003 NEC Corporation 10 * Copyright (C) 2002,2003 NEC Corporation
11 * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) 11 * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org)
12 * Copyright (C) 2003-2005 Hewlett Packard 12 * Copyright (C) 2003-2005 Hewlett Packard
13 * 13 *
14 * All rights reserved. 14 * All rights reserved.
diff --git a/drivers/pci/hotplug/acpiphp_core.c b/drivers/pci/hotplug/acpiphp_core.c
index c9e2bd40c038..853e04ad272c 100644
--- a/drivers/pci/hotplug/acpiphp_core.c
+++ b/drivers/pci/hotplug/acpiphp_core.c
@@ -8,7 +8,7 @@
8 * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) 8 * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com)
9 * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) 9 * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com)
10 * Copyright (C) 2002,2003 NEC Corporation 10 * Copyright (C) 2002,2003 NEC Corporation
11 * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) 11 * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org)
12 * Copyright (C) 2003-2005 Hewlett Packard 12 * Copyright (C) 2003-2005 Hewlett Packard
13 * 13 *
14 * All rights reserved. 14 * All rights reserved.
@@ -40,7 +40,7 @@ bool acpiphp_disabled;
40static struct acpiphp_attention_info *attention_info; 40static struct acpiphp_attention_info *attention_info;
41 41
42#define DRIVER_VERSION "0.5" 42#define DRIVER_VERSION "0.5"
43#define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@us.ibm.com>, Takayoshi Kochi <t-kochi@bq.jp.nec.com>, Matthew Wilcox <willy@hp.com>" 43#define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@us.ibm.com>, Takayoshi Kochi <t-kochi@bq.jp.nec.com>, Matthew Wilcox <willy@infradead.org>"
44#define DRIVER_DESC "ACPI Hot Plug PCI Controller Driver" 44#define DRIVER_DESC "ACPI Hot Plug PCI Controller Driver"
45 45
46MODULE_AUTHOR(DRIVER_AUTHOR); 46MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 12afa7fdf77e..e4c46637f32f 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -5,7 +5,7 @@
5 * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) 5 * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com)
6 * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) 6 * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com)
7 * Copyright (C) 2002,2003 NEC Corporation 7 * Copyright (C) 2002,2003 NEC Corporation
8 * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) 8 * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org)
9 * Copyright (C) 2003-2005 Hewlett Packard 9 * Copyright (C) 2003-2005 Hewlett Packard
10 * Copyright (C) 2005 Rajesh Shah (rajesh.shah@intel.com) 10 * Copyright (C) 2005 Rajesh Shah (rajesh.shah@intel.com)
11 * Copyright (C) 2005 Intel Corporation 11 * Copyright (C) 2005 Intel Corporation
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index 595cf90af9bb..bdee9bd09f11 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -35,7 +35,6 @@ static atomic_long_t erofs_global_shrink_cnt;
35 35
36#ifdef CONFIG_EROFS_FS_ZIP 36#ifdef CONFIG_EROFS_FS_ZIP
37 37
38/* radix_tree and the future XArray both don't use tagptr_t yet */
39struct erofs_workgroup *erofs_find_workgroup( 38struct erofs_workgroup *erofs_find_workgroup(
40 struct super_block *sb, pgoff_t index, bool *tag) 39 struct super_block *sb, pgoff_t index, bool *tag)
41{ 40{
@@ -47,9 +46,8 @@ repeat:
47 rcu_read_lock(); 46 rcu_read_lock();
48 grp = radix_tree_lookup(&sbi->workstn_tree, index); 47 grp = radix_tree_lookup(&sbi->workstn_tree, index);
49 if (grp != NULL) { 48 if (grp != NULL) {
50 *tag = radix_tree_exceptional_entry(grp); 49 *tag = xa_pointer_tag(grp);
51 grp = (void *)((unsigned long)grp & 50 grp = xa_untag_pointer(grp);
52 ~RADIX_TREE_EXCEPTIONAL_ENTRY);
53 51
54 if (erofs_workgroup_get(grp, &oldcount)) { 52 if (erofs_workgroup_get(grp, &oldcount)) {
55 /* prefer to relax rcu read side */ 53 /* prefer to relax rcu read side */
@@ -83,9 +81,7 @@ int erofs_register_workgroup(struct super_block *sb,
83 sbi = EROFS_SB(sb); 81 sbi = EROFS_SB(sb);
84 erofs_workstn_lock(sbi); 82 erofs_workstn_lock(sbi);
85 83
86 if (tag) 84 grp = xa_tag_pointer(grp, tag);
87 grp = (void *)((unsigned long)grp |
88 1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
89 85
90 err = radix_tree_insert(&sbi->workstn_tree, 86 err = radix_tree_insert(&sbi->workstn_tree,
91 grp->index, grp); 87 grp->index, grp);
@@ -131,9 +127,7 @@ repeat:
131 127
132 for (i = 0; i < found; ++i) { 128 for (i = 0; i < found; ++i) {
133 int cnt; 129 int cnt;
134 struct erofs_workgroup *grp = (void *) 130 struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
135 ((unsigned long)batch[i] &
136 ~RADIX_TREE_EXCEPTIONAL_ENTRY);
137 131
138 first_index = grp->index + 1; 132 first_index = grp->index + 1;
139 133
@@ -150,8 +144,8 @@ repeat:
150#endif 144#endif
151 continue; 145 continue;
152 146
153 if (radix_tree_delete(&sbi->workstn_tree, 147 if (xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
154 grp->index) != grp) { 148 grp->index)) != grp) {
155#ifdef EROFS_FS_HAS_MANAGED_CACHE 149#ifdef EROFS_FS_HAS_MANAGED_CACHE
156skip: 150skip:
157 erofs_workgroup_unfreeze(grp, 1); 151 erofs_workgroup_unfreeze(grp, 1);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8703ce68fe9d..2955a4ea2fa8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
437 if (pg_index > end_index) 437 if (pg_index > end_index)
438 break; 438 break;
439 439
440 rcu_read_lock(); 440 page = xa_load(&mapping->i_pages, pg_index);
441 page = radix_tree_lookup(&mapping->i_pages, pg_index); 441 if (page && !xa_is_value(page)) {
442 rcu_read_unlock();
443 if (page && !radix_tree_exceptional_entry(page)) {
444 misses++; 442 misses++;
445 if (misses > 4) 443 if (misses > 4)
446 break; 444 break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6877a74c7469..d228f706ff3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3784,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping,
3784 pgoff_t index; 3784 pgoff_t index;
3785 pgoff_t end; /* Inclusive */ 3785 pgoff_t end; /* Inclusive */
3786 int scanned = 0; 3786 int scanned = 0;
3787 int tag; 3787 xa_mark_t tag;
3788 3788
3789 pagevec_init(&pvec); 3789 pagevec_init(&pvec);
3790 if (wbc->range_cyclic) { 3790 if (wbc->range_cyclic) {
@@ -3909,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
3909 pgoff_t done_index; 3909 pgoff_t done_index;
3910 int range_whole = 0; 3910 int range_whole = 0;
3911 int scanned = 0; 3911 int scanned = 0;
3912 int tag; 3912 xa_mark_t tag;
3913 3913
3914 /* 3914 /*
3915 * We have to hold onto the inode so that ordered extents can do their 3915 * We have to hold onto the inode so that ordered extents can do their
@@ -5159,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
5159 5159
5160 clear_page_dirty_for_io(page); 5160 clear_page_dirty_for_io(page);
5161 xa_lock_irq(&page->mapping->i_pages); 5161 xa_lock_irq(&page->mapping->i_pages);
5162 if (!PageDirty(page)) { 5162 if (!PageDirty(page))
5163 radix_tree_tag_clear(&page->mapping->i_pages, 5163 __xa_clear_mark(&page->mapping->i_pages,
5164 page_index(page), 5164 page_index(page), PAGECACHE_TAG_DIRTY);
5165 PAGECACHE_TAG_DIRTY);
5166 }
5167 xa_unlock_irq(&page->mapping->i_pages); 5165 xa_unlock_irq(&page->mapping->i_pages);
5168 ClearPageError(page); 5166 ClearPageError(page);
5169 unlock_page(page); 5167 unlock_page(page);
diff --git a/fs/buffer.c b/fs/buffer.c
index 109f55196866..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
562EXPORT_SYMBOL(mark_buffer_dirty_inode); 562EXPORT_SYMBOL(mark_buffer_dirty_inode);
563 563
564/* 564/*
565 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode 565 * Mark the page dirty, and set it dirty in the page cache, and mark the inode
566 * dirty. 566 * dirty.
567 * 567 *
568 * If warn is true, then emit a warning if the page is not uptodate and has 568 * If warn is true, then emit a warning if the page is not uptodate and has
@@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping,
579 if (page->mapping) { /* Race with truncate? */ 579 if (page->mapping) { /* Race with truncate? */
580 WARN_ON_ONCE(warn && !PageUptodate(page)); 580 WARN_ON_ONCE(warn && !PageUptodate(page));
581 account_page_dirtied(page, mapping); 581 account_page_dirtied(page, mapping);
582 radix_tree_tag_set(&mapping->i_pages, 582 __xa_set_mark(&mapping->i_pages, page_index(page),
583 page_index(page), PAGECACHE_TAG_DIRTY); 583 PAGECACHE_TAG_DIRTY);
584 } 584 }
585 xa_unlock_irqrestore(&mapping->i_pages, flags); 585 xa_unlock_irqrestore(&mapping->i_pages, flags);
586} 586}
@@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
1050 * The relationship between dirty buffers and dirty pages: 1050 * The relationship between dirty buffers and dirty pages:
1051 * 1051 *
1052 * Whenever a page has any dirty buffers, the page's dirty bit is set, and 1052 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1053 * the page is tagged dirty in its radix tree. 1053 * the page is tagged dirty in the page cache.
1054 * 1054 *
1055 * At all times, the dirtiness of the buffers represents the dirtiness of 1055 * At all times, the dirtiness of the buffers represents the dirtiness of
1056 * subsections of the page. If the page has buffers, the page dirty bit is 1056 * subsections of the page. If the page has buffers, the page dirty bit is
@@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block,
1073 * mark_buffer_dirty - mark a buffer_head as needing writeout 1073 * mark_buffer_dirty - mark a buffer_head as needing writeout
1074 * @bh: the buffer_head to mark dirty 1074 * @bh: the buffer_head to mark dirty
1075 * 1075 *
1076 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its 1076 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1077 * backing page dirty, then tag the page as dirty in its address_space's radix 1077 * its backing page dirty, then tag the page as dirty in the page cache
1078 * tree and then attach the address_space's inode to its superblock's dirty 1078 * and then attach the address_space's inode to its superblock's dirty
1079 * inode list. 1079 * inode list.
1080 * 1080 *
1081 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1081 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
diff --git a/fs/dax.c b/fs/dax.c
index 0fb270f0a0ef..616e36ea6aaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -38,6 +38,17 @@
38#define CREATE_TRACE_POINTS 38#define CREATE_TRACE_POINTS
39#include <trace/events/fs_dax.h> 39#include <trace/events/fs_dax.h>
40 40
41static inline unsigned int pe_order(enum page_entry_size pe_size)
42{
43 if (pe_size == PE_SIZE_PTE)
44 return PAGE_SHIFT - PAGE_SHIFT;
45 if (pe_size == PE_SIZE_PMD)
46 return PMD_SHIFT - PAGE_SHIFT;
47 if (pe_size == PE_SIZE_PUD)
48 return PUD_SHIFT - PAGE_SHIFT;
49 return ~0;
50}
51
41/* We choose 4096 entries - same as per-zone page wait tables */ 52/* We choose 4096 entries - same as per-zone page wait tables */
42#define DAX_WAIT_TABLE_BITS 12 53#define DAX_WAIT_TABLE_BITS 12
43#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 54#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -46,6 +57,9 @@
46#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 57#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
47#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) 58#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
48 59
60/* The order of a PMD entry */
61#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
62
49static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 63static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
50 64
51static int __init init_dax_wait_table(void) 65static int __init init_dax_wait_table(void)
@@ -59,63 +73,74 @@ static int __init init_dax_wait_table(void)
59fs_initcall(init_dax_wait_table); 73fs_initcall(init_dax_wait_table);
60 74
61/* 75/*
62 * We use lowest available bit in exceptional entry for locking, one bit for 76 * DAX pagecache entries use XArray value entries so they can't be mistaken
63 * the entry size (PMD) and two more to tell us if the entry is a zero page or 77 * for pages. We use one bit for locking, one bit for the entry size (PMD)
64 * an empty entry that is just used for locking. In total four special bits. 78 * and two more to tell us if the entry is a zero page or an empty entry that
79 * is just used for locking. In total four special bits.
65 * 80 *
66 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE 81 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
67 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem 82 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
68 * block allocation. 83 * block allocation.
69 */ 84 */
70#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) 85#define DAX_SHIFT (4)
71#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) 86#define DAX_LOCKED (1UL << 0)
72#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) 87#define DAX_PMD (1UL << 1)
73#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) 88#define DAX_ZERO_PAGE (1UL << 2)
74#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) 89#define DAX_EMPTY (1UL << 3)
75 90
76static unsigned long dax_radix_pfn(void *entry) 91static unsigned long dax_to_pfn(void *entry)
77{ 92{
78 return (unsigned long)entry >> RADIX_DAX_SHIFT; 93 return xa_to_value(entry) >> DAX_SHIFT;
79} 94}
80 95
81static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) 96static void *dax_make_entry(pfn_t pfn, unsigned long flags)
82{ 97{
83 return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | 98 return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
84 (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
85} 99}
86 100
87static unsigned int dax_radix_order(void *entry) 101static void *dax_make_page_entry(struct page *page)
88{ 102{
89 if ((unsigned long)entry & RADIX_DAX_PMD) 103 pfn_t pfn = page_to_pfn_t(page);
90 return PMD_SHIFT - PAGE_SHIFT; 104 return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0);
105}
106
107static bool dax_is_locked(void *entry)
108{
109 return xa_to_value(entry) & DAX_LOCKED;
110}
111
112static unsigned int dax_entry_order(void *entry)
113{
114 if (xa_to_value(entry) & DAX_PMD)
115 return PMD_ORDER;
91 return 0; 116 return 0;
92} 117}
93 118
94static int dax_is_pmd_entry(void *entry) 119static int dax_is_pmd_entry(void *entry)
95{ 120{
96 return (unsigned long)entry & RADIX_DAX_PMD; 121 return xa_to_value(entry) & DAX_PMD;
97} 122}
98 123
99static int dax_is_pte_entry(void *entry) 124static int dax_is_pte_entry(void *entry)
100{ 125{
101 return !((unsigned long)entry & RADIX_DAX_PMD); 126 return !(xa_to_value(entry) & DAX_PMD);
102} 127}
103 128
104static int dax_is_zero_entry(void *entry) 129static int dax_is_zero_entry(void *entry)
105{ 130{
106 return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; 131 return xa_to_value(entry) & DAX_ZERO_PAGE;
107} 132}
108 133
109static int dax_is_empty_entry(void *entry) 134static int dax_is_empty_entry(void *entry)
110{ 135{
111 return (unsigned long)entry & RADIX_DAX_EMPTY; 136 return xa_to_value(entry) & DAX_EMPTY;
112} 137}
113 138
114/* 139/*
115 * DAX radix tree locking 140 * DAX page cache entry locking
116 */ 141 */
117struct exceptional_entry_key { 142struct exceptional_entry_key {
118 struct address_space *mapping; 143 struct xarray *xa;
119 pgoff_t entry_start; 144 pgoff_t entry_start;
120}; 145};
121 146
@@ -124,10 +149,11 @@ struct wait_exceptional_entry_queue {
124 struct exceptional_entry_key key; 149 struct exceptional_entry_key key;
125}; 150};
126 151
127static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, 152static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
128 pgoff_t index, void *entry, struct exceptional_entry_key *key) 153 void *entry, struct exceptional_entry_key *key)
129{ 154{
130 unsigned long hash; 155 unsigned long hash;
156 unsigned long index = xas->xa_index;
131 157
132 /* 158 /*
133 * If 'entry' is a PMD, align the 'index' that we use for the wait 159 * If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -136,22 +162,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
136 */ 162 */
137 if (dax_is_pmd_entry(entry)) 163 if (dax_is_pmd_entry(entry))
138 index &= ~PG_PMD_COLOUR; 164 index &= ~PG_PMD_COLOUR;
139 165 key->xa = xas->xa;
140 key->mapping = mapping;
141 key->entry_start = index; 166 key->entry_start = index;
142 167
143 hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); 168 hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
144 return wait_table + hash; 169 return wait_table + hash;
145} 170}
146 171
147static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, 172static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
148 int sync, void *keyp) 173 unsigned int mode, int sync, void *keyp)
149{ 174{
150 struct exceptional_entry_key *key = keyp; 175 struct exceptional_entry_key *key = keyp;
151 struct wait_exceptional_entry_queue *ewait = 176 struct wait_exceptional_entry_queue *ewait =
152 container_of(wait, struct wait_exceptional_entry_queue, wait); 177 container_of(wait, struct wait_exceptional_entry_queue, wait);
153 178
154 if (key->mapping != ewait->key.mapping || 179 if (key->xa != ewait->key.xa ||
155 key->entry_start != ewait->key.entry_start) 180 key->entry_start != ewait->key.entry_start)
156 return 0; 181 return 0;
157 return autoremove_wake_function(wait, mode, sync, NULL); 182 return autoremove_wake_function(wait, mode, sync, NULL);
@@ -162,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
162 * The important information it's conveying is whether the entry at 187 * The important information it's conveying is whether the entry at
163 * this index used to be a PMD entry. 188 * this index used to be a PMD entry.
164 */ 189 */
165static void dax_wake_mapping_entry_waiter(struct address_space *mapping, 190static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
166 pgoff_t index, void *entry, bool wake_all)
167{ 191{
168 struct exceptional_entry_key key; 192 struct exceptional_entry_key key;
169 wait_queue_head_t *wq; 193 wait_queue_head_t *wq;
170 194
171 wq = dax_entry_waitqueue(mapping, index, entry, &key); 195 wq = dax_entry_waitqueue(xas, entry, &key);
172 196
173 /* 197 /*
174 * Checking for locked entry and prepare_to_wait_exclusive() happens 198 * Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -181,55 +205,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
181} 205}
182 206
183/* 207/*
184 * Check whether the given slot is locked. Must be called with the i_pages 208 * Look up entry in page cache, wait for it to become unlocked if it
185 * lock held. 209 * is a DAX entry and return it. The caller must subsequently call
186 */ 210 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
187static inline int slot_locked(struct address_space *mapping, void **slot) 211 * if it did.
188{
189 unsigned long entry = (unsigned long)
190 radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
191 return entry & RADIX_DAX_ENTRY_LOCK;
192}
193
194/*
195 * Mark the given slot as locked. Must be called with the i_pages lock held.
196 */
197static inline void *lock_slot(struct address_space *mapping, void **slot)
198{
199 unsigned long entry = (unsigned long)
200 radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
201
202 entry |= RADIX_DAX_ENTRY_LOCK;
203 radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
204 return (void *)entry;
205}
206
207/*
208 * Mark the given slot as unlocked. Must be called with the i_pages lock held.
209 */
210static inline void *unlock_slot(struct address_space *mapping, void **slot)
211{
212 unsigned long entry = (unsigned long)
213 radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
214
215 entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
216 radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
217 return (void *)entry;
218}
219
220/*
221 * Lookup entry in radix tree, wait for it to become unlocked if it is
222 * exceptional entry and return it. The caller must call
223 * put_unlocked_mapping_entry() when he decided not to lock the entry or
224 * put_locked_mapping_entry() when he locked the entry and now wants to
225 * unlock it.
226 * 212 *
227 * Must be called with the i_pages lock held. 213 * Must be called with the i_pages lock held.
228 */ 214 */
229static void *__get_unlocked_mapping_entry(struct address_space *mapping, 215static void *get_unlocked_entry(struct xa_state *xas)
230 pgoff_t index, void ***slotp, bool (*wait_fn)(void))
231{ 216{
232 void *entry, **slot; 217 void *entry;
233 struct wait_exceptional_entry_queue ewait; 218 struct wait_exceptional_entry_queue ewait;
234 wait_queue_head_t *wq; 219 wait_queue_head_t *wq;
235 220
@@ -237,80 +222,54 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
237 ewait.wait.func = wake_exceptional_entry_func; 222 ewait.wait.func = wake_exceptional_entry_func;
238 223
239 for (;;) { 224 for (;;) {
240 bool revalidate; 225 entry = xas_load(xas);
241 226 if (!entry || xa_is_internal(entry) ||
242 entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, 227 WARN_ON_ONCE(!xa_is_value(entry)) ||
243 &slot); 228 !dax_is_locked(entry))
244 if (!entry ||
245 WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
246 !slot_locked(mapping, slot)) {
247 if (slotp)
248 *slotp = slot;
249 return entry; 229 return entry;
250 }
251 230
252 wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); 231 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
253 prepare_to_wait_exclusive(wq, &ewait.wait, 232 prepare_to_wait_exclusive(wq, &ewait.wait,
254 TASK_UNINTERRUPTIBLE); 233 TASK_UNINTERRUPTIBLE);
255 xa_unlock_irq(&mapping->i_pages); 234 xas_unlock_irq(xas);
256 revalidate = wait_fn(); 235 xas_reset(xas);
236 schedule();
257 finish_wait(wq, &ewait.wait); 237 finish_wait(wq, &ewait.wait);
258 xa_lock_irq(&mapping->i_pages); 238 xas_lock_irq(xas);
259 if (revalidate)
260 return ERR_PTR(-EAGAIN);
261 } 239 }
262} 240}
263 241
264static bool entry_wait(void) 242static void put_unlocked_entry(struct xa_state *xas, void *entry)
265{
266 schedule();
267 /*
268 * Never return an ERR_PTR() from
269 * __get_unlocked_mapping_entry(), just keep looping.
270 */
271 return false;
272}
273
274static void *get_unlocked_mapping_entry(struct address_space *mapping,
275 pgoff_t index, void ***slotp)
276{ 243{
277 return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); 244 /* If we were the only waiter woken, wake the next one */
278} 245 if (entry)
279 246 dax_wake_entry(xas, entry, false);
280static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
281{
282 void *entry, **slot;
283
284 xa_lock_irq(&mapping->i_pages);
285 entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
286 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
287 !slot_locked(mapping, slot))) {
288 xa_unlock_irq(&mapping->i_pages);
289 return;
290 }
291 unlock_slot(mapping, slot);
292 xa_unlock_irq(&mapping->i_pages);
293 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
294} 247}
295 248
296static void put_locked_mapping_entry(struct address_space *mapping, 249/*
297 pgoff_t index) 250 * We used the xa_state to get the entry, but then we locked the entry and
251 * dropped the xa_lock, so we know the xa_state is stale and must be reset
252 * before use.
253 */
254static void dax_unlock_entry(struct xa_state *xas, void *entry)
298{ 255{
299 unlock_mapping_entry(mapping, index); 256 void *old;
257
258 xas_reset(xas);
259 xas_lock_irq(xas);
260 old = xas_store(xas, entry);
261 xas_unlock_irq(xas);
262 BUG_ON(!dax_is_locked(old));
263 dax_wake_entry(xas, entry, false);
300} 264}
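The comment above dax_unlock_entry() relies on a general XArray rule: once the xa_lock has been dropped, the position cached in the xa_state may point at freed nodes and must be discarded with xas_reset() before the state is used again. A minimal sketch of that pattern on a generic xarray (the function name is invented for this example):

        #include <linux/sched.h>
        #include <linux/xarray.h>

        static void *reload_after_unlock(struct xarray *xa, unsigned long index)
        {
                XA_STATE(xas, xa, index);
                void *entry;

                xas_lock_irq(&xas);
                entry = xas_load(&xas);         /* xa_state now caches a tree position */
                xas_unlock_irq(&xas);

                cond_resched();                 /* anything may change while unlocked */

                xas_reset(&xas);                /* forget the stale cached position... */
                xas_lock_irq(&xas);
                entry = xas_load(&xas);         /* ...and walk again from the top */
                xas_unlock_irq(&xas);

                return entry;
        }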
301 265
302/* 266/*
303 * Called when we are done with radix tree entry we looked up via 267 * Return: The entry stored at this location before it was locked.
304 * get_unlocked_mapping_entry() and which we didn't lock in the end.
305 */ 268 */
306static void put_unlocked_mapping_entry(struct address_space *mapping, 269static void *dax_lock_entry(struct xa_state *xas, void *entry)
307 pgoff_t index, void *entry)
308{ 270{
309 if (!entry) 271 unsigned long v = xa_to_value(entry);
310 return; 272 return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
311
312 /* We have to wake up next waiter for the radix tree entry lock */
313 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
314} 273}
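dax_lock_entry() and dax_is_locked() operate on xarray *value* entries: the pfn and flag bits live inside the tagged integer, and the lock is just one of those bits. A sketch of the encode/decode round trip; DAX_LOCKED is defined earlier in fs/dax.c, and treating it as bit 0 here is only an assumption for illustration.

        #include <linux/xarray.h>

        #define EXAMPLE_DAX_LOCKED      (1UL << 0)      /* assumed bit position */

        static void *example_lock(void *entry)
        {
                unsigned long v = xa_to_value(entry);           /* strip the value tag */

                return xa_mk_value(v | EXAMPLE_DAX_LOCKED);     /* re-tag, lock bit set */
        }

        static bool example_is_locked(void *entry)
        {
                return xa_to_value(entry) & EXAMPLE_DAX_LOCKED;
        }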
315 274
316static unsigned long dax_entry_size(void *entry) 275static unsigned long dax_entry_size(void *entry)
@@ -325,9 +284,9 @@ static unsigned long dax_entry_size(void *entry)
325 return PAGE_SIZE; 284 return PAGE_SIZE;
326} 285}
327 286
328static unsigned long dax_radix_end_pfn(void *entry) 287static unsigned long dax_end_pfn(void *entry)
329{ 288{
330 return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; 289 return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
331} 290}
332 291
333/* 292/*
@@ -335,8 +294,8 @@ static unsigned long dax_radix_end_pfn(void *entry)
335 * 'empty' and 'zero' entries. 294 * 'empty' and 'zero' entries.
336 */ 295 */
337#define for_each_mapped_pfn(entry, pfn) \ 296#define for_each_mapped_pfn(entry, pfn) \
338 for (pfn = dax_radix_pfn(entry); \ 297 for (pfn = dax_to_pfn(entry); \
339 pfn < dax_radix_end_pfn(entry); pfn++) 298 pfn < dax_end_pfn(entry); pfn++)
340 299
341/* 300/*
342 * TODO: for reflink+dax we need a way to associate a single page with 301 * TODO: for reflink+dax we need a way to associate a single page with
@@ -393,33 +352,16 @@ static struct page *dax_busy_page(void *entry)
393 return NULL; 352 return NULL;
394} 353}
395 354
396static bool entry_wait_revalidate(void)
397{
398 rcu_read_unlock();
399 schedule();
400 rcu_read_lock();
401
402 /*
403 * Tell __get_unlocked_mapping_entry() to take a break, we need
404 * to revalidate page->mapping after dropping locks
405 */
406 return true;
407}
408
409bool dax_lock_mapping_entry(struct page *page) 355bool dax_lock_mapping_entry(struct page *page)
410{ 356{
411 pgoff_t index; 357 XA_STATE(xas, NULL, 0);
412 struct inode *inode; 358 void *entry;
413 bool did_lock = false;
414 void *entry = NULL, **slot;
415 struct address_space *mapping;
416 359
417 rcu_read_lock();
418 for (;;) { 360 for (;;) {
419 mapping = READ_ONCE(page->mapping); 361 struct address_space *mapping = READ_ONCE(page->mapping);
420 362
421 if (!dax_mapping(mapping)) 363 if (!dax_mapping(mapping))
422 break; 364 return false;
423 365
424 /* 366 /*
425 * In the device-dax case there's no need to lock, a 367 * In the device-dax case there's no need to lock, a
@@ -428,98 +370,94 @@ bool dax_lock_mapping_entry(struct page *page)
428 * otherwise we would not have a valid pfn_to_page() 370 * otherwise we would not have a valid pfn_to_page()
429 * translation. 371 * translation.
430 */ 372 */
431 inode = mapping->host; 373 if (S_ISCHR(mapping->host->i_mode))
432 if (S_ISCHR(inode->i_mode)) { 374 return true;
433 did_lock = true;
434 break;
435 }
436 375
437 xa_lock_irq(&mapping->i_pages); 376 xas.xa = &mapping->i_pages;
377 xas_lock_irq(&xas);
438 if (mapping != page->mapping) { 378 if (mapping != page->mapping) {
439 xa_unlock_irq(&mapping->i_pages); 379 xas_unlock_irq(&xas);
440 continue; 380 continue;
441 } 381 }
442 index = page->index; 382 xas_set(&xas, page->index);
443 383 entry = xas_load(&xas);
444 entry = __get_unlocked_mapping_entry(mapping, index, &slot, 384 if (dax_is_locked(entry)) {
445 entry_wait_revalidate); 385 entry = get_unlocked_entry(&xas);
446 if (!entry) { 386 /* Did the page move while we slept? */
447 xa_unlock_irq(&mapping->i_pages); 387 if (dax_to_pfn(entry) != page_to_pfn(page)) {
448 break; 388 xas_unlock_irq(&xas);
449 } else if (IS_ERR(entry)) { 389 continue;
450 xa_unlock_irq(&mapping->i_pages); 390 }
451 WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
452 continue;
453 } 391 }
454 lock_slot(mapping, slot); 392 dax_lock_entry(&xas, entry);
455 did_lock = true; 393 xas_unlock_irq(&xas);
456 xa_unlock_irq(&mapping->i_pages); 394 return true;
457 break;
458 } 395 }
459 rcu_read_unlock();
460
461 return did_lock;
462} 396}
463 397
464void dax_unlock_mapping_entry(struct page *page) 398void dax_unlock_mapping_entry(struct page *page)
465{ 399{
466 struct address_space *mapping = page->mapping; 400 struct address_space *mapping = page->mapping;
467 struct inode *inode = mapping->host; 401 XA_STATE(xas, &mapping->i_pages, page->index);
468 402
469 if (S_ISCHR(inode->i_mode)) 403 if (S_ISCHR(mapping->host->i_mode))
470 return; 404 return;
471 405
472 unlock_mapping_entry(mapping, page->index); 406 dax_unlock_entry(&xas, dax_make_page_entry(page));
473} 407}
474 408
475/* 409/*
476 * Find radix tree entry at given index. If it points to an exceptional entry, 410 * Find page cache entry at given index. If it is a DAX entry, return it
477 * return it with the radix tree entry locked. If the radix tree doesn't 411 * with the entry locked. If the page cache doesn't contain an entry at
478 * contain given index, create an empty exceptional entry for the index and 412 * that index, add a locked empty entry.
479 * return with it locked.
480 * 413 *
481 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will 414 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
482 * either return that locked entry or will return an error. This error will 415 * either return that locked entry or will return VM_FAULT_FALLBACK.
483 * happen if there are any 4k entries within the 2MiB range that we are 416 * This will happen if there are any PTE entries within the PMD range
484 * requesting. 417 * that we are requesting.
485 * 418 *
486 * We always favor 4k entries over 2MiB entries. There isn't a flow where we 419 * We always favor PTE entries over PMD entries. There isn't a flow where we
487 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB 420 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
488 * insertion will fail if it finds any 4k entries already in the tree, and a 421 * insertion will fail if it finds any PTE entries already in the tree, and a
489 * 4k insertion will cause an existing 2MiB entry to be unmapped and 422 * PTE insertion will cause an existing PMD entry to be unmapped and
490 * downgraded to 4k entries. This happens for both 2MiB huge zero pages as 423 * downgraded to PTE entries. This happens for both PMD zero pages as
491 * well as 2MiB empty entries. 424 * well as PMD empty entries.
492 * 425 *
493 * The exception to this downgrade path is for 2MiB DAX PMD entries that have 426 * The exception to this downgrade path is for PMD entries that have
494 * real storage backing them. We will leave these real 2MiB DAX entries in 427 * real storage backing them. We will leave these real PMD entries in
495 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. 428 * the tree, and PTE writes will simply dirty the entire PMD entry.
496 * 429 *
497 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 430 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
498 * persistent memory the benefit is doubtful. We can add that later if we can 431 * persistent memory the benefit is doubtful. We can add that later if we can
499 * show it helps. 432 * show it helps.
433 *
434 * On error, this function does not return an ERR_PTR. Instead it returns
435 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
436 * overlap with xarray value entries.
500 */ 437 */
501static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, 438static void *grab_mapping_entry(struct xa_state *xas,
502 unsigned long size_flag) 439 struct address_space *mapping, unsigned long size_flag)
503{ 440{
504 bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ 441 unsigned long index = xas->xa_index;
505 void *entry, **slot; 442 bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
506 443 void *entry;
507restart:
508 xa_lock_irq(&mapping->i_pages);
509 entry = get_unlocked_mapping_entry(mapping, index, &slot);
510 444
511 if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { 445retry:
512 entry = ERR_PTR(-EIO); 446 xas_lock_irq(xas);
513 goto out_unlock; 447 entry = get_unlocked_entry(xas);
514 } 448 if (xa_is_internal(entry))
449 goto fallback;
515 450
516 if (entry) { 451 if (entry) {
517 if (size_flag & RADIX_DAX_PMD) { 452 if (WARN_ON_ONCE(!xa_is_value(entry))) {
453 xas_set_err(xas, EIO);
454 goto out_unlock;
455 }
456
457 if (size_flag & DAX_PMD) {
518 if (dax_is_pte_entry(entry)) { 458 if (dax_is_pte_entry(entry)) {
519 put_unlocked_mapping_entry(mapping, index, 459 put_unlocked_entry(xas, entry);
520 entry); 460 goto fallback;
521 entry = ERR_PTR(-EEXIST);
522 goto out_unlock;
523 } 461 }
524 } else { /* trying to grab a PTE entry */ 462 } else { /* trying to grab a PTE entry */
525 if (dax_is_pmd_entry(entry) && 463 if (dax_is_pmd_entry(entry) &&
@@ -530,87 +468,57 @@ restart:
530 } 468 }
531 } 469 }
532 470
533 /* No entry for given index? Make sure radix tree is big enough. */ 471 if (pmd_downgrade) {
534 if (!entry || pmd_downgrade) { 472 /*
535 int err; 473 * Make sure 'entry' remains valid while we drop
536 474 * the i_pages lock.
537 if (pmd_downgrade) { 475 */
538 /* 476 dax_lock_entry(xas, entry);
539 * Make sure 'entry' remains valid while we drop
540 * the i_pages lock.
541 */
542 entry = lock_slot(mapping, slot);
543 }
544 477
545 xa_unlock_irq(&mapping->i_pages);
546 /* 478 /*
547 * Besides huge zero pages the only other thing that gets 479 * Besides huge zero pages the only other thing that gets
548 * downgraded are empty entries which don't need to be 480 * downgraded are empty entries which don't need to be
549 * unmapped. 481 * unmapped.
550 */ 482 */
551 if (pmd_downgrade && dax_is_zero_entry(entry)) 483 if (dax_is_zero_entry(entry)) {
552 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, 484 xas_unlock_irq(xas);
553 PG_PMD_NR, false); 485 unmap_mapping_pages(mapping,
554 486 xas->xa_index & ~PG_PMD_COLOUR,
555 err = radix_tree_preload( 487 PG_PMD_NR, false);
556 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 488 xas_reset(xas);
557 if (err) { 489 xas_lock_irq(xas);
558 if (pmd_downgrade)
559 put_locked_mapping_entry(mapping, index);
560 return ERR_PTR(err);
561 }
562 xa_lock_irq(&mapping->i_pages);
563
564 if (!entry) {
565 /*
566 * We needed to drop the i_pages lock while calling
567 * radix_tree_preload() and we didn't have an entry to
568 * lock. See if another thread inserted an entry at
569 * our index during this time.
570 */
571 entry = __radix_tree_lookup(&mapping->i_pages, index,
572 NULL, &slot);
573 if (entry) {
574 radix_tree_preload_end();
575 xa_unlock_irq(&mapping->i_pages);
576 goto restart;
577 }
578 } 490 }
579 491
580 if (pmd_downgrade) { 492 dax_disassociate_entry(entry, mapping, false);
581 dax_disassociate_entry(entry, mapping, false); 493 xas_store(xas, NULL); /* undo the PMD join */
582 radix_tree_delete(&mapping->i_pages, index); 494 dax_wake_entry(xas, entry, true);
583 mapping->nrexceptional--; 495 mapping->nrexceptional--;
584 dax_wake_mapping_entry_waiter(mapping, index, entry, 496 entry = NULL;
585 true); 497 xas_set(xas, index);
586 } 498 }
587 499
588 entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); 500 if (entry) {
589 501 dax_lock_entry(xas, entry);
590 err = __radix_tree_insert(&mapping->i_pages, index, 502 } else {
591 dax_radix_order(entry), entry); 503 entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
592 radix_tree_preload_end(); 504 dax_lock_entry(xas, entry);
593 if (err) { 505 if (xas_error(xas))
594 xa_unlock_irq(&mapping->i_pages); 506 goto out_unlock;
595 /*
596 * Our insertion of a DAX entry failed, most likely
597 * because we were inserting a PMD entry and it
598 * collided with a PTE sized entry at a different
599 * index in the PMD range. We haven't inserted
600 * anything into the radix tree and have no waiters to
601 * wake.
602 */
603 return ERR_PTR(err);
604 }
605 /* Good, we have inserted empty locked entry into the tree. */
606 mapping->nrexceptional++; 507 mapping->nrexceptional++;
607 xa_unlock_irq(&mapping->i_pages);
608 return entry;
609 } 508 }
610 entry = lock_slot(mapping, slot); 509
611 out_unlock: 510out_unlock:
612 xa_unlock_irq(&mapping->i_pages); 511 xas_unlock_irq(xas);
512 if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
513 goto retry;
514 if (xas->xa_node == XA_ERROR(-ENOMEM))
515 return xa_mk_internal(VM_FAULT_OOM);
516 if (xas_error(xas))
517 return xa_mk_internal(VM_FAULT_SIGBUS);
613 return entry; 518 return entry;
519fallback:
520 xas_unlock_irq(xas);
521 return xa_mk_internal(VM_FAULT_FALLBACK);
614} 522}
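As the comment above grab_mapping_entry() explains, failures are no longer reported as ERR_PTRs but as xarray internal entries wrapping a VM_FAULT code, which can never collide with real value entries. A sketch of how a caller distinguishes and unpacks them, mirroring the checks in dax_iomap_pte_fault() and dax_iomap_pmd_fault() further down (the helper name is invented for this example):

        #include <linux/mm.h>           /* vm_fault_t, VM_FAULT_* */
        #include <linux/xarray.h>

        static vm_fault_t example_check(void *entry)
        {
                if (xa_is_internal(entry))              /* error path */
                        return xa_to_internal(entry);   /* FALLBACK, OOM or SIGBUS */
                return 0;                               /* got a locked DAX entry */
        }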
615 523
616/** 524/**
@@ -630,11 +538,10 @@ restart:
630 */ 538 */
631struct page *dax_layout_busy_page(struct address_space *mapping) 539struct page *dax_layout_busy_page(struct address_space *mapping)
632{ 540{
633 pgoff_t indices[PAGEVEC_SIZE]; 541 XA_STATE(xas, &mapping->i_pages, 0);
542 void *entry;
543 unsigned int scanned = 0;
634 struct page *page = NULL; 544 struct page *page = NULL;
635 struct pagevec pvec;
636 pgoff_t index, end;
637 unsigned i;
638 545
639 /* 546 /*
640 * In the 'limited' case get_user_pages() for dax is disabled. 547 * In the 'limited' case get_user_pages() for dax is disabled.
@@ -645,13 +552,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
645 if (!dax_mapping(mapping) || !mapping_mapped(mapping)) 552 if (!dax_mapping(mapping) || !mapping_mapped(mapping))
646 return NULL; 553 return NULL;
647 554
648 pagevec_init(&pvec);
649 index = 0;
650 end = -1;
651
652 /* 555 /*
653 * If we race get_user_pages_fast() here either we'll see the 556 * If we race get_user_pages_fast() here either we'll see the
654 * elevated page count in the pagevec_lookup and wait, or 557 * elevated page count in the iteration and wait, or
655 * get_user_pages_fast() will see that the page it took a reference 558 * get_user_pages_fast() will see that the page it took a reference
656 * against is no longer mapped in the page tables and bail to the 559 * against is no longer mapped in the page tables and bail to the
657 * get_user_pages() slow path. The slow path is protected by 560 * get_user_pages() slow path. The slow path is protected by
@@ -663,94 +566,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
663 */ 566 */
664 unmap_mapping_range(mapping, 0, 0, 1); 567 unmap_mapping_range(mapping, 0, 0, 1);
665 568
666 while (index < end && pagevec_lookup_entries(&pvec, mapping, index, 569 xas_lock_irq(&xas);
667 min(end - index, (pgoff_t)PAGEVEC_SIZE), 570 xas_for_each(&xas, entry, ULONG_MAX) {
668 indices)) { 571 if (WARN_ON_ONCE(!xa_is_value(entry)))
669 pgoff_t nr_pages = 1; 572 continue;
670 573 if (unlikely(dax_is_locked(entry)))
671 for (i = 0; i < pagevec_count(&pvec); i++) { 574 entry = get_unlocked_entry(&xas);
672 struct page *pvec_ent = pvec.pages[i]; 575 if (entry)
673 void *entry; 576 page = dax_busy_page(entry);
674 577 put_unlocked_entry(&xas, entry);
675 index = indices[i];
676 if (index >= end)
677 break;
678
679 if (WARN_ON_ONCE(
680 !radix_tree_exceptional_entry(pvec_ent)))
681 continue;
682
683 xa_lock_irq(&mapping->i_pages);
684 entry = get_unlocked_mapping_entry(mapping, index, NULL);
685 if (entry) {
686 page = dax_busy_page(entry);
687 /*
688 * Account for multi-order entries at
689 * the end of the pagevec.
690 */
691 if (i + 1 >= pagevec_count(&pvec))
692 nr_pages = 1UL << dax_radix_order(entry);
693 }
694 put_unlocked_mapping_entry(mapping, index, entry);
695 xa_unlock_irq(&mapping->i_pages);
696 if (page)
697 break;
698 }
699
700 /*
701 * We don't expect normal struct page entries to exist in our
702 * tree, but we keep these pagevec calls so that this code is
703 * consistent with the common pattern for handling pagevecs
704 * throughout the kernel.
705 */
706 pagevec_remove_exceptionals(&pvec);
707 pagevec_release(&pvec);
708 index += nr_pages;
709
710 if (page) 578 if (page)
711 break; 579 break;
580 if (++scanned % XA_CHECK_SCHED)
581 continue;
582
583 xas_pause(&xas);
584 xas_unlock_irq(&xas);
585 cond_resched();
586 xas_lock_irq(&xas);
712 } 587 }
588 xas_unlock_irq(&xas);
713 return page; 589 return page;
714} 590}
715EXPORT_SYMBOL_GPL(dax_layout_busy_page); 591EXPORT_SYMBOL_GPL(dax_layout_busy_page);
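The pagevec loop is gone: dax_layout_busy_page() now scans the array directly with xas_for_each(), periodically pausing the walk to drop the lock and reschedule. A stripped-down sketch of that scan shape on a generic xarray; RESCHED_EVERY is our own constant standing in for the XA_CHECK_SCHED value used above.

        #include <linux/kernel.h>
        #include <linux/sched.h>
        #include <linux/xarray.h>

        #define RESCHED_EVERY   4096

        static void scan_values(struct xarray *xa)
        {
                XA_STATE(xas, xa, 0);
                unsigned int scanned = 0;
                void *entry;

                xas_lock_irq(&xas);
                xas_for_each(&xas, entry, ULONG_MAX) {
                        /* ... inspect 'entry' while holding the lock ... */
                        if (++scanned % RESCHED_EVERY)
                                continue;
                        xas_pause(&xas);        /* remember where we stopped */
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
                xas_unlock_irq(&xas);
        }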
716 592
717static int __dax_invalidate_mapping_entry(struct address_space *mapping, 593static int __dax_invalidate_entry(struct address_space *mapping,
718 pgoff_t index, bool trunc) 594 pgoff_t index, bool trunc)
719{ 595{
596 XA_STATE(xas, &mapping->i_pages, index);
720 int ret = 0; 597 int ret = 0;
721 void *entry; 598 void *entry;
722 struct radix_tree_root *pages = &mapping->i_pages;
723 599
724 xa_lock_irq(pages); 600 xas_lock_irq(&xas);
725 entry = get_unlocked_mapping_entry(mapping, index, NULL); 601 entry = get_unlocked_entry(&xas);
726 if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) 602 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
727 goto out; 603 goto out;
728 if (!trunc && 604 if (!trunc &&
729 (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || 605 (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
730 radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) 606 xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
731 goto out; 607 goto out;
732 dax_disassociate_entry(entry, mapping, trunc); 608 dax_disassociate_entry(entry, mapping, trunc);
733 radix_tree_delete(pages, index); 609 xas_store(&xas, NULL);
734 mapping->nrexceptional--; 610 mapping->nrexceptional--;
735 ret = 1; 611 ret = 1;
736out: 612out:
737 put_unlocked_mapping_entry(mapping, index, entry); 613 put_unlocked_entry(&xas, entry);
738 xa_unlock_irq(pages); 614 xas_unlock_irq(&xas);
739 return ret; 615 return ret;
740} 616}
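__dax_invalidate_entry() now tests the dirty/towrite tags with xas_get_mark() and erases the slot by storing NULL through the same xa_state. A minimal sketch of that mark-test-then-erase step on a generic xarray, with XA_MARK_0/XA_MARK_1 standing in for PAGECACHE_TAG_DIRTY and PAGECACHE_TAG_TOWRITE:

        #include <linux/xarray.h>

        static int erase_if_clean(struct xarray *xa, unsigned long index)
        {
                XA_STATE(xas, xa, index);
                int ret = 0;

                xas_lock_irq(&xas);
                xas_load(&xas);                         /* position the xa_state */
                if (!xas_get_mark(&xas, XA_MARK_0) &&
                    !xas_get_mark(&xas, XA_MARK_1)) {
                        xas_store(&xas, NULL);          /* erase the entry */
                        ret = 1;
                }
                xas_unlock_irq(&xas);
                return ret;
        }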
617
741/* 618/*
742 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree 619 * Delete DAX entry at @index from @mapping. Wait for it
743 * entry to get unlocked before deleting it. 620 * to be unlocked before deleting it.
744 */ 621 */
745int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) 622int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
746{ 623{
747 int ret = __dax_invalidate_mapping_entry(mapping, index, true); 624 int ret = __dax_invalidate_entry(mapping, index, true);
748 625
749 /* 626 /*
750 * This gets called from truncate / punch_hole path. As such, the caller 627 * This gets called from truncate / punch_hole path. As such, the caller
751 * must hold locks protecting against concurrent modifications of the 628 * must hold locks protecting against concurrent modifications of the
752 * radix tree (usually fs-private i_mmap_sem for writing). Since the 629 * page cache (usually fs-private i_mmap_sem for writing). Since the
753 * caller has seen exceptional entry for this index, we better find it 630 * caller has seen a DAX entry for this index, we better find it
754 * at that index as well... 631 * at that index as well...
755 */ 632 */
756 WARN_ON_ONCE(!ret); 633 WARN_ON_ONCE(!ret);
@@ -758,12 +635,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
758} 635}
759 636
760/* 637/*
761 * Invalidate exceptional DAX entry if it is clean. 638 * Invalidate DAX entry if it is clean.
762 */ 639 */
763int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 640int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
764 pgoff_t index) 641 pgoff_t index)
765{ 642{
766 return __dax_invalidate_mapping_entry(mapping, index, false); 643 return __dax_invalidate_entry(mapping, index, false);
767} 644}
768 645
769static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, 646static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
@@ -799,30 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
799 * already in the tree, we will skip the insertion and just dirty the PMD as 676 * already in the tree, we will skip the insertion and just dirty the PMD as
800 * appropriate. 677 * appropriate.
801 */ 678 */
802static void *dax_insert_mapping_entry(struct address_space *mapping, 679static void *dax_insert_entry(struct xa_state *xas,
803 struct vm_fault *vmf, 680 struct address_space *mapping, struct vm_fault *vmf,
804 void *entry, pfn_t pfn_t, 681 void *entry, pfn_t pfn, unsigned long flags, bool dirty)
805 unsigned long flags, bool dirty)
806{ 682{
807 struct radix_tree_root *pages = &mapping->i_pages; 683 void *new_entry = dax_make_entry(pfn, flags);
808 unsigned long pfn = pfn_t_to_pfn(pfn_t);
809 pgoff_t index = vmf->pgoff;
810 void *new_entry;
811 684
812 if (dirty) 685 if (dirty)
813 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 686 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
814 687
815 if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { 688 if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
689 unsigned long index = xas->xa_index;
816 /* we are replacing a zero page with block mapping */ 690 /* we are replacing a zero page with block mapping */
817 if (dax_is_pmd_entry(entry)) 691 if (dax_is_pmd_entry(entry))
818 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, 692 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
819 PG_PMD_NR, false); 693 PG_PMD_NR, false);
820 else /* pte entry */ 694 else /* pte entry */
821 unmap_mapping_pages(mapping, vmf->pgoff, 1, false); 695 unmap_mapping_pages(mapping, index, 1, false);
822 } 696 }
823 697
824 xa_lock_irq(pages); 698 xas_reset(xas);
825 new_entry = dax_radix_locked_entry(pfn, flags); 699 xas_lock_irq(xas);
826 if (dax_entry_size(entry) != dax_entry_size(new_entry)) { 700 if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
827 dax_disassociate_entry(entry, mapping, false); 701 dax_disassociate_entry(entry, mapping, false);
828 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); 702 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
@@ -830,33 +704,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
830 704
831 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 705 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
832 /* 706 /*
833 * Only swap our new entry into the radix tree if the current 707 * Only swap our new entry into the page cache if the current
834 * entry is a zero page or an empty entry. If a normal PTE or 708 * entry is a zero page or an empty entry. If a normal PTE or
835 * PMD entry is already in the tree, we leave it alone. This 709 * PMD entry is already in the cache, we leave it alone. This
836 * means that if we are trying to insert a PTE and the 710 * means that if we are trying to insert a PTE and the
837 * existing entry is a PMD, we will just leave the PMD in the 711 * existing entry is a PMD, we will just leave the PMD in the
838 * tree and dirty it if necessary. 712 * tree and dirty it if necessary.
839 */ 713 */
840 struct radix_tree_node *node; 714 void *old = dax_lock_entry(xas, new_entry);
841 void **slot; 715 WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
842 void *ret; 716 DAX_LOCKED));
843
844 ret = __radix_tree_lookup(pages, index, &node, &slot);
845 WARN_ON_ONCE(ret != entry);
846 __radix_tree_replace(pages, node, slot,
847 new_entry, NULL);
848 entry = new_entry; 717 entry = new_entry;
718 } else {
719 xas_load(xas); /* Walk the xa_state */
849 } 720 }
850 721
851 if (dirty) 722 if (dirty)
852 radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); 723 xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
853 724
854 xa_unlock_irq(pages); 725 xas_unlock_irq(xas);
855 return entry; 726 return entry;
856} 727}
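dax_insert_entry() finishes by setting the dirty mark with xas_set_mark() while still holding the lock used for the store. A minimal generic sketch of that store-then-mark step; XA_MARK_0 stands in for PAGECACHE_TAG_DIRTY, and the slot is assumed to exist already (as it does here, because grab_mapping_entry() created it), so no xas_nomem() retry loop is shown.

        #include <linux/xarray.h>

        static void store_and_dirty(struct xarray *xa, unsigned long index,
                                    unsigned long value)
        {
                XA_STATE(xas, xa, index);

                xas_lock_irq(&xas);
                xas_store(&xas, xa_mk_value(value));    /* slot assumed to exist */
                xas_set_mark(&xas, XA_MARK_0);          /* e.g. PAGECACHE_TAG_DIRTY */
                xas_unlock_irq(&xas);
        }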
857 728
858static inline unsigned long 729static inline
859pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) 730unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
860{ 731{
861 unsigned long address; 732 unsigned long address;
862 733
@@ -866,8 +737,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
866} 737}
867 738
868/* Walk all mappings of a given index of a file and writeprotect them */ 739/* Walk all mappings of a given index of a file and writeprotect them */
869static void dax_mapping_entry_mkclean(struct address_space *mapping, 740static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
870 pgoff_t index, unsigned long pfn) 741 unsigned long pfn)
871{ 742{
872 struct vm_area_struct *vma; 743 struct vm_area_struct *vma;
873 pte_t pte, *ptep = NULL; 744 pte_t pte, *ptep = NULL;
@@ -937,11 +808,9 @@ unlock_pte:
937 i_mmap_unlock_read(mapping); 808 i_mmap_unlock_read(mapping);
938} 809}
939 810
940static int dax_writeback_one(struct dax_device *dax_dev, 811static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
941 struct address_space *mapping, pgoff_t index, void *entry) 812 struct address_space *mapping, void *entry)
942{ 813{
943 struct radix_tree_root *pages = &mapping->i_pages;
944 void *entry2, **slot;
945 unsigned long pfn; 814 unsigned long pfn;
946 long ret = 0; 815 long ret = 0;
947 size_t size; 816 size_t size;
@@ -950,32 +819,38 @@ static int dax_writeback_one(struct dax_device *dax_dev,
950 * A page got tagged dirty in DAX mapping? Something is seriously 819 * A page got tagged dirty in DAX mapping? Something is seriously
951 * wrong. 820 * wrong.
952 */ 821 */
953 if (WARN_ON(!radix_tree_exceptional_entry(entry))) 822 if (WARN_ON(!xa_is_value(entry)))
954 return -EIO; 823 return -EIO;
955 824
956 xa_lock_irq(pages); 825 if (unlikely(dax_is_locked(entry))) {
957 entry2 = get_unlocked_mapping_entry(mapping, index, &slot); 826 void *old_entry = entry;
958 /* Entry got punched out / reallocated? */ 827
959 if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) 828 entry = get_unlocked_entry(xas);
960 goto put_unlocked; 829
961 /* 830 /* Entry got punched out / reallocated? */
962 * Entry got reallocated elsewhere? No need to writeback. We have to 831 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
963 * compare pfns as we must not bail out due to difference in lockbit 832 goto put_unlocked;
964 * or entry type. 833 /*
965 */ 834 * Entry got reallocated elsewhere? No need to writeback.
966 if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) 835 * We have to compare pfns as we must not bail out due to
967 goto put_unlocked; 836 * difference in lockbit or entry type.
968 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 837 */
969 dax_is_zero_entry(entry))) { 838 if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
970 ret = -EIO; 839 goto put_unlocked;
971 goto put_unlocked; 840 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
841 dax_is_zero_entry(entry))) {
842 ret = -EIO;
843 goto put_unlocked;
844 }
845
846 /* Another fsync thread may have already done this entry */
847 if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
848 goto put_unlocked;
972 } 849 }
973 850
974 /* Another fsync thread may have already written back this entry */
975 if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
976 goto put_unlocked;
977 /* Lock the entry to serialize with page faults */ 851 /* Lock the entry to serialize with page faults */
978 entry = lock_slot(mapping, slot); 852 dax_lock_entry(xas, entry);
853
979 /* 854 /*
980 * We can clear the tag now but we have to be careful so that concurrent 855 * We can clear the tag now but we have to be careful so that concurrent
981 * dax_writeback_one() calls for the same index cannot finish before we 856 * dax_writeback_one() calls for the same index cannot finish before we
@@ -983,8 +858,8 @@ static int dax_writeback_one(struct dax_device *dax_dev,
983 * at the entry only under the i_pages lock and once they do that 858 * at the entry only under the i_pages lock and once they do that
984 * they will see the entry locked and wait for it to unlock. 859 * they will see the entry locked and wait for it to unlock.
985 */ 860 */
986 radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); 861 xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
987 xa_unlock_irq(pages); 862 xas_unlock_irq(xas);
988 863
989 /* 864 /*
990 * Even if dax_writeback_mapping_range() was given a wbc->range_start 865 * Even if dax_writeback_mapping_range() was given a wbc->range_start
@@ -993,10 +868,10 @@ static int dax_writeback_one(struct dax_device *dax_dev,
993 * This allows us to flush for PMD_SIZE and not have to worry about 868 * This allows us to flush for PMD_SIZE and not have to worry about
994 * partial PMD writebacks. 869 * partial PMD writebacks.
995 */ 870 */
996 pfn = dax_radix_pfn(entry); 871 pfn = dax_to_pfn(entry);
997 size = PAGE_SIZE << dax_radix_order(entry); 872 size = PAGE_SIZE << dax_entry_order(entry);
998 873
999 dax_mapping_entry_mkclean(mapping, index, pfn); 874 dax_entry_mkclean(mapping, xas->xa_index, pfn);
1000 dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); 875 dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
1001 /* 876 /*
1002 * After we have flushed the cache, we can clear the dirty tag. There 877 * After we have flushed the cache, we can clear the dirty tag. There
@@ -1004,16 +879,18 @@ static int dax_writeback_one(struct dax_device *dax_dev,
1004 * the pfn mappings are writeprotected and fault waits for mapping 879 * the pfn mappings are writeprotected and fault waits for mapping
1005 * entry lock. 880 * entry lock.
1006 */ 881 */
1007 xa_lock_irq(pages); 882 xas_reset(xas);
1008 radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); 883 xas_lock_irq(xas);
1009 xa_unlock_irq(pages); 884 xas_store(xas, entry);
1010 trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); 885 xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
1011 put_locked_mapping_entry(mapping, index); 886 dax_wake_entry(xas, entry, false);
887
888 trace_dax_writeback_one(mapping->host, xas->xa_index,
889 size >> PAGE_SHIFT);
1012 return ret; 890 return ret;
1013 891
1014 put_unlocked: 892 put_unlocked:
1015 put_unlocked_mapping_entry(mapping, index, entry2); 893 put_unlocked_entry(xas, entry);
1016 xa_unlock_irq(pages);
1017 return ret; 894 return ret;
1018} 895}
1019 896
@@ -1025,13 +902,13 @@ static int dax_writeback_one(struct dax_device *dax_dev,
1025int dax_writeback_mapping_range(struct address_space *mapping, 902int dax_writeback_mapping_range(struct address_space *mapping,
1026 struct block_device *bdev, struct writeback_control *wbc) 903 struct block_device *bdev, struct writeback_control *wbc)
1027{ 904{
905 XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
1028 struct inode *inode = mapping->host; 906 struct inode *inode = mapping->host;
1029 pgoff_t start_index, end_index; 907 pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
1030 pgoff_t indices[PAGEVEC_SIZE];
1031 struct dax_device *dax_dev; 908 struct dax_device *dax_dev;
1032 struct pagevec pvec; 909 void *entry;
1033 bool done = false; 910 int ret = 0;
1034 int i, ret = 0; 911 unsigned int scanned = 0;
1035 912
1036 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 913 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
1037 return -EIO; 914 return -EIO;
@@ -1043,41 +920,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
1043 if (!dax_dev) 920 if (!dax_dev)
1044 return -EIO; 921 return -EIO;
1045 922
1046 start_index = wbc->range_start >> PAGE_SHIFT; 923 trace_dax_writeback_range(inode, xas.xa_index, end_index);
1047 end_index = wbc->range_end >> PAGE_SHIFT;
1048
1049 trace_dax_writeback_range(inode, start_index, end_index);
1050 924
1051 tag_pages_for_writeback(mapping, start_index, end_index); 925 tag_pages_for_writeback(mapping, xas.xa_index, end_index);
1052 926
1053 pagevec_init(&pvec); 927 xas_lock_irq(&xas);
1054 while (!done) { 928 xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
1055 pvec.nr = find_get_entries_tag(mapping, start_index, 929 ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
1056 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, 930 if (ret < 0) {
1057 pvec.pages, indices); 931 mapping_set_error(mapping, ret);
1058
1059 if (pvec.nr == 0)
1060 break; 932 break;
1061
1062 for (i = 0; i < pvec.nr; i++) {
1063 if (indices[i] > end_index) {
1064 done = true;
1065 break;
1066 }
1067
1068 ret = dax_writeback_one(dax_dev, mapping, indices[i],
1069 pvec.pages[i]);
1070 if (ret < 0) {
1071 mapping_set_error(mapping, ret);
1072 goto out;
1073 }
1074 } 933 }
1075 start_index = indices[pvec.nr - 1] + 1; 934 if (++scanned % XA_CHECK_SCHED)
935 continue;
936
937 xas_pause(&xas);
938 xas_unlock_irq(&xas);
939 cond_resched();
940 xas_lock_irq(&xas);
1076 } 941 }
1077out: 942 xas_unlock_irq(&xas);
1078 put_dax(dax_dev); 943 put_dax(dax_dev);
1079 trace_dax_writeback_range_done(inode, start_index, end_index); 944 trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
1080 return (ret < 0 ? ret : 0); 945 return ret;
1081} 946}
1082EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 947EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
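Instead of batching tagged lookups through find_get_entries_tag(), the writeback path now iterates only the entries carrying the towrite mark with xas_for_each_marked(). A stripped-down sketch of that loop shape (XA_MARK_1 stands in for PAGECACHE_TAG_TOWRITE, process_one() is a placeholder, and the resched dance shown earlier is omitted; the real dax_writeback_one() also drops and retakes the lock internally):

        #include <linux/xarray.h>

        static int walk_marked(struct xarray *xa, unsigned long end,
                               int (*process_one)(void *entry))
        {
                XA_STATE(xas, xa, 0);
                void *entry;
                int ret = 0;

                xas_lock_irq(&xas);
                xas_for_each_marked(&xas, entry, end, XA_MARK_1) {
                        ret = process_one(entry);       /* called under the lock here */
                        if (ret < 0)
                                break;
                }
                xas_unlock_irq(&xas);
                return ret;
        }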
1083 948
@@ -1125,16 +990,18 @@ out:
1125 * If this page is ever written to we will re-fault and change the mapping to 990 * If this page is ever written to we will re-fault and change the mapping to
1126 * point to real DAX storage instead. 991 * point to real DAX storage instead.
1127 */ 992 */
1128static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, 993static vm_fault_t dax_load_hole(struct xa_state *xas,
1129 struct vm_fault *vmf) 994 struct address_space *mapping, void **entry,
995 struct vm_fault *vmf)
1130{ 996{
1131 struct inode *inode = mapping->host; 997 struct inode *inode = mapping->host;
1132 unsigned long vaddr = vmf->address; 998 unsigned long vaddr = vmf->address;
1133 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); 999 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
1134 vm_fault_t ret; 1000 vm_fault_t ret;
1135 1001
1136 dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, 1002 *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1137 false); 1003 DAX_ZERO_PAGE, false);
1004
1138 ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); 1005 ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
1139 trace_dax_load_hole(inode, vmf, ret); 1006 trace_dax_load_hole(inode, vmf, ret);
1140 return ret; 1007 return ret;
@@ -1342,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1342{ 1209{
1343 struct vm_area_struct *vma = vmf->vma; 1210 struct vm_area_struct *vma = vmf->vma;
1344 struct address_space *mapping = vma->vm_file->f_mapping; 1211 struct address_space *mapping = vma->vm_file->f_mapping;
1212 XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
1345 struct inode *inode = mapping->host; 1213 struct inode *inode = mapping->host;
1346 unsigned long vaddr = vmf->address; 1214 unsigned long vaddr = vmf->address;
1347 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1215 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1368,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1368 if (write && !vmf->cow_page) 1236 if (write && !vmf->cow_page)
1369 flags |= IOMAP_WRITE; 1237 flags |= IOMAP_WRITE;
1370 1238
1371 entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 1239 entry = grab_mapping_entry(&xas, mapping, 0);
1372 if (IS_ERR(entry)) { 1240 if (xa_is_internal(entry)) {
1373 ret = dax_fault_return(PTR_ERR(entry)); 1241 ret = xa_to_internal(entry);
1374 goto out; 1242 goto out;
1375 } 1243 }
1376 1244
@@ -1443,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1443 if (error < 0) 1311 if (error < 0)
1444 goto error_finish_iomap; 1312 goto error_finish_iomap;
1445 1313
1446 entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 1314 entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1447 0, write && !sync); 1315 0, write && !sync);
1448 1316
1449 /* 1317 /*
@@ -1471,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1471 case IOMAP_UNWRITTEN: 1339 case IOMAP_UNWRITTEN:
1472 case IOMAP_HOLE: 1340 case IOMAP_HOLE:
1473 if (!write) { 1341 if (!write) {
1474 ret = dax_load_hole(mapping, entry, vmf); 1342 ret = dax_load_hole(&xas, mapping, &entry, vmf);
1475 goto finish_iomap; 1343 goto finish_iomap;
1476 } 1344 }
1477 /*FALLTHRU*/ 1345 /*FALLTHRU*/
@@ -1498,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1498 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); 1366 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
1499 } 1367 }
1500 unlock_entry: 1368 unlock_entry:
1501 put_locked_mapping_entry(mapping, vmf->pgoff); 1369 dax_unlock_entry(&xas, entry);
1502 out: 1370 out:
1503 trace_dax_pte_fault_done(inode, vmf, ret); 1371 trace_dax_pte_fault_done(inode, vmf, ret);
1504 return ret | major; 1372 return ret | major;
1505} 1373}
1506 1374
1507#ifdef CONFIG_FS_DAX_PMD 1375#ifdef CONFIG_FS_DAX_PMD
1508static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, 1376static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1509 void *entry) 1377 struct iomap *iomap, void **entry)
1510{ 1378{
1511 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1379 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1512 unsigned long pmd_addr = vmf->address & PMD_MASK; 1380 unsigned long pmd_addr = vmf->address & PMD_MASK;
1513 struct inode *inode = mapping->host; 1381 struct inode *inode = mapping->host;
1514 struct page *zero_page; 1382 struct page *zero_page;
1515 void *ret = NULL;
1516 spinlock_t *ptl; 1383 spinlock_t *ptl;
1517 pmd_t pmd_entry; 1384 pmd_t pmd_entry;
1518 pfn_t pfn; 1385 pfn_t pfn;
@@ -1523,8 +1390,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1523 goto fallback; 1390 goto fallback;
1524 1391
1525 pfn = page_to_pfn_t(zero_page); 1392 pfn = page_to_pfn_t(zero_page);
1526 ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 1393 *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
1527 RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); 1394 DAX_PMD | DAX_ZERO_PAGE, false);
1528 1395
1529 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1396 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1530 if (!pmd_none(*(vmf->pmd))) { 1397 if (!pmd_none(*(vmf->pmd))) {
@@ -1536,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1536 pmd_entry = pmd_mkhuge(pmd_entry); 1403 pmd_entry = pmd_mkhuge(pmd_entry);
1537 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); 1404 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
1538 spin_unlock(ptl); 1405 spin_unlock(ptl);
1539 trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); 1406 trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
1540 return VM_FAULT_NOPAGE; 1407 return VM_FAULT_NOPAGE;
1541 1408
1542fallback: 1409fallback:
1543 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); 1410 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
1544 return VM_FAULT_FALLBACK; 1411 return VM_FAULT_FALLBACK;
1545} 1412}
1546 1413
@@ -1549,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1549{ 1416{
1550 struct vm_area_struct *vma = vmf->vma; 1417 struct vm_area_struct *vma = vmf->vma;
1551 struct address_space *mapping = vma->vm_file->f_mapping; 1418 struct address_space *mapping = vma->vm_file->f_mapping;
1419 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
1552 unsigned long pmd_addr = vmf->address & PMD_MASK; 1420 unsigned long pmd_addr = vmf->address & PMD_MASK;
1553 bool write = vmf->flags & FAULT_FLAG_WRITE; 1421 bool write = vmf->flags & FAULT_FLAG_WRITE;
1554 bool sync; 1422 bool sync;
@@ -1556,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1556 struct inode *inode = mapping->host; 1424 struct inode *inode = mapping->host;
1557 vm_fault_t result = VM_FAULT_FALLBACK; 1425 vm_fault_t result = VM_FAULT_FALLBACK;
1558 struct iomap iomap = { 0 }; 1426 struct iomap iomap = { 0 };
1559 pgoff_t max_pgoff, pgoff; 1427 pgoff_t max_pgoff;
1560 void *entry; 1428 void *entry;
1561 loff_t pos; 1429 loff_t pos;
1562 int error; 1430 int error;
@@ -1567,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1567 * supposed to hold locks serializing us with truncate / punch hole so 1435 * supposed to hold locks serializing us with truncate / punch hole so
1568 * this is a reliable test. 1436 * this is a reliable test.
1569 */ 1437 */
1570 pgoff = linear_page_index(vma, pmd_addr);
1571 max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 1438 max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
1572 1439
1573 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); 1440 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1576,7 +1443,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1576 * Make sure that the faulting address's PMD offset (color) matches 1443 * Make sure that the faulting address's PMD offset (color) matches
1577 * the PMD offset from the start of the file. This is necessary so 1444 * the PMD offset from the start of the file. This is necessary so
1578 * that a PMD range in the page table overlaps exactly with a PMD 1445 * that a PMD range in the page table overlaps exactly with a PMD
1579 * range in the radix tree. 1446 * range in the page cache.
1580 */ 1447 */
1581 if ((vmf->pgoff & PG_PMD_COLOUR) != 1448 if ((vmf->pgoff & PG_PMD_COLOUR) !=
1582 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) 1449 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
@@ -1592,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1592 if ((pmd_addr + PMD_SIZE) > vma->vm_end) 1459 if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1593 goto fallback; 1460 goto fallback;
1594 1461
1595 if (pgoff >= max_pgoff) { 1462 if (xas.xa_index >= max_pgoff) {
1596 result = VM_FAULT_SIGBUS; 1463 result = VM_FAULT_SIGBUS;
1597 goto out; 1464 goto out;
1598 } 1465 }
1599 1466
1600 /* If the PMD would extend beyond the file size */ 1467 /* If the PMD would extend beyond the file size */
1601 if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) 1468 if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
1602 goto fallback; 1469 goto fallback;
1603 1470
1604 /* 1471 /*
1605 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a 1472 * grab_mapping_entry() will make sure we get an empty PMD entry,
1606 * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page 1473 * a zero PMD entry or a DAX PMD. If it can't (because a PTE
1607 * is already in the tree, for instance), it will return -EEXIST and 1474 * entry is already in the array, for instance), it will return
1608 * we just fall back to 4k entries. 1475 * VM_FAULT_FALLBACK.
1609 */ 1476 */
1610 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1477 entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
1611 if (IS_ERR(entry)) 1478 if (xa_is_internal(entry)) {
1479 result = xa_to_internal(entry);
1612 goto fallback; 1480 goto fallback;
1481 }
1613 1482
1614 /* 1483 /*
1615 * It is possible, particularly with mixed reads & writes to private 1484 * It is possible, particularly with mixed reads & writes to private
@@ -1628,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1628 * setting up a mapping, so really we're using iomap_begin() as a way 1497 * setting up a mapping, so really we're using iomap_begin() as a way
1629 * to look up our filesystem block. 1498 * to look up our filesystem block.
1630 */ 1499 */
1631 pos = (loff_t)pgoff << PAGE_SHIFT; 1500 pos = (loff_t)xas.xa_index << PAGE_SHIFT;
1632 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); 1501 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
1633 if (error) 1502 if (error)
1634 goto unlock_entry; 1503 goto unlock_entry;
@@ -1644,8 +1513,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1644 if (error < 0) 1513 if (error < 0)
1645 goto finish_iomap; 1514 goto finish_iomap;
1646 1515
1647 entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 1516 entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
1648 RADIX_DAX_PMD, write && !sync); 1517 DAX_PMD, write && !sync);
1649 1518
1650 /* 1519 /*
1651 * If we are doing synchronous page fault and inode needs fsync, 1520 * If we are doing synchronous page fault and inode needs fsync,
@@ -1669,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1669 case IOMAP_HOLE: 1538 case IOMAP_HOLE:
1670 if (WARN_ON_ONCE(write)) 1539 if (WARN_ON_ONCE(write))
1671 break; 1540 break;
1672 result = dax_pmd_load_hole(vmf, &iomap, entry); 1541 result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
1673 break; 1542 break;
1674 default: 1543 default:
1675 WARN_ON_ONCE(1); 1544 WARN_ON_ONCE(1);
@@ -1692,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1692 &iomap); 1561 &iomap);
1693 } 1562 }
1694 unlock_entry: 1563 unlock_entry:
1695 put_locked_mapping_entry(mapping, pgoff); 1564 dax_unlock_entry(&xas, entry);
1696 fallback: 1565 fallback:
1697 if (result == VM_FAULT_FALLBACK) { 1566 if (result == VM_FAULT_FALLBACK) {
1698 split_huge_pmd(vma, vmf->pmd, vmf->address); 1567 split_huge_pmd(vma, vmf->pmd, vmf->address);
@@ -1737,54 +1606,49 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
1737} 1606}
1738EXPORT_SYMBOL_GPL(dax_iomap_fault); 1607EXPORT_SYMBOL_GPL(dax_iomap_fault);
1739 1608
1740/** 1609/*
1741 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables 1610 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
1742 * @vmf: The description of the fault 1611 * @vmf: The description of the fault
1743 * @pe_size: Size of entry to be inserted
1744 * @pfn: PFN to insert 1612 * @pfn: PFN to insert
1613 * @order: Order of entry to insert.
1745 * 1614 *
1746 * This function inserts writeable PTE or PMD entry into page tables for mmaped 1615 * This function inserts a writeable PTE or PMD entry into the page tables
1747 * DAX file. It takes care of marking corresponding radix tree entry as dirty 1616 * for an mmaped DAX file. It also marks the page cache entry as dirty.
1748 * as well.
1749 */ 1617 */
1750static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, 1618static vm_fault_t
1751 enum page_entry_size pe_size, 1619dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
1752 pfn_t pfn)
1753{ 1620{
1754 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1621 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1755 void *entry, **slot; 1622 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
1756 pgoff_t index = vmf->pgoff; 1623 void *entry;
1757 vm_fault_t ret; 1624 vm_fault_t ret;
1758 1625
1759 xa_lock_irq(&mapping->i_pages); 1626 xas_lock_irq(&xas);
1760 entry = get_unlocked_mapping_entry(mapping, index, &slot); 1627 entry = get_unlocked_entry(&xas);
1761 /* Did we race with someone splitting entry or so? */ 1628 /* Did we race with someone splitting entry or so? */
1762 if (!entry || 1629 if (!entry ||
1763 (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || 1630 (order == 0 && !dax_is_pte_entry(entry)) ||
1764 (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { 1631 (order == PMD_ORDER && (xa_is_internal(entry) ||
1765 put_unlocked_mapping_entry(mapping, index, entry); 1632 !dax_is_pmd_entry(entry)))) {
1766 xa_unlock_irq(&mapping->i_pages); 1633 put_unlocked_entry(&xas, entry);
1634 xas_unlock_irq(&xas);
1767 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, 1635 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
1768 VM_FAULT_NOPAGE); 1636 VM_FAULT_NOPAGE);
1769 return VM_FAULT_NOPAGE; 1637 return VM_FAULT_NOPAGE;
1770 } 1638 }
1771 radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); 1639 xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
1772 entry = lock_slot(mapping, slot); 1640 dax_lock_entry(&xas, entry);
1773 xa_unlock_irq(&mapping->i_pages); 1641 xas_unlock_irq(&xas);
1774 switch (pe_size) { 1642 if (order == 0)
1775 case PE_SIZE_PTE:
1776 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); 1643 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1777 break;
1778#ifdef CONFIG_FS_DAX_PMD 1644#ifdef CONFIG_FS_DAX_PMD
1779 case PE_SIZE_PMD: 1645 else if (order == PMD_ORDER)
1780 ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1646 ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
1781 pfn, true); 1647 pfn, true);
1782 break;
1783#endif 1648#endif
1784 default: 1649 else
1785 ret = VM_FAULT_FALLBACK; 1650 ret = VM_FAULT_FALLBACK;
1786 } 1651 dax_unlock_entry(&xas, entry);
1787 put_locked_mapping_entry(mapping, index);
1788 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); 1652 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
1789 return ret; 1653 return ret;
1790} 1654}
@@ -1804,17 +1668,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
1804{ 1668{
1805 int err; 1669 int err;
1806 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; 1670 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1807 size_t len = 0; 1671 unsigned int order = pe_order(pe_size);
1672 size_t len = PAGE_SIZE << order;
1808 1673
1809 if (pe_size == PE_SIZE_PTE)
1810 len = PAGE_SIZE;
1811 else if (pe_size == PE_SIZE_PMD)
1812 len = PMD_SIZE;
1813 else
1814 WARN_ON_ONCE(1);
1815 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); 1674 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1816 if (err) 1675 if (err)
1817 return VM_FAULT_SIGBUS; 1676 return VM_FAULT_SIGBUS;
1818 return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); 1677 return dax_insert_pfn_mkwrite(vmf, pfn, order);
1819} 1678}
1820EXPORT_SYMBOL_GPL(dax_finish_sync_fault); 1679EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
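dax_finish_sync_fault() replaces the PTE/PMD if/else chain with "len = PAGE_SIZE << order". pe_order() itself is added earlier in the patch, outside this hunk; the sketch below only checks the size arithmetic the new code relies on, namely that shifting PAGE_SIZE by the PTE order (0) and by the PMD order reproduces the old PAGE_SIZE and PMD_SIZE lengths.

        #include <linux/build_bug.h>
        #include <linux/mm.h>

        static inline void dax_len_checks(void)
        {
                BUILD_BUG_ON((PAGE_SIZE << 0) != PAGE_SIZE);
                BUILD_BUG_ON((PAGE_SIZE << (PMD_SHIFT - PAGE_SHIFT)) != PMD_SIZE);
        }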
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c3d9a42c561e..05f01fbd9c7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2643,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2643 long left = mpd->wbc->nr_to_write; 2643 long left = mpd->wbc->nr_to_write;
2644 pgoff_t index = mpd->first_page; 2644 pgoff_t index = mpd->first_page;
2645 pgoff_t end = mpd->last_page; 2645 pgoff_t end = mpd->last_page;
2646 int tag; 2646 xa_mark_t tag;
2647 int i, err = 0; 2647 int i, err = 0;
2648 int blkbits = mpd->inode->i_blkbits; 2648 int blkbits = mpd->inode->i_blkbits;
2649 ext4_lblk_t lblk; 2649 ext4_lblk_t lblk;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 106f116466bf..b293cb3e27a2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2071,7 +2071,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
2071 pgoff_t done_index; 2071 pgoff_t done_index;
2072 int cycled; 2072 int cycled;
2073 int range_whole = 0; 2073 int range_whole = 0;
2074 int tag; 2074 xa_mark_t tag;
2075 int nwritten = 0; 2075 int nwritten = 0;
2076 2076
2077 pagevec_init(&pvec); 2077 pagevec_init(&pvec);
@@ -2787,13 +2787,13 @@ const struct address_space_operations f2fs_dblock_aops = {
2787#endif 2787#endif
2788}; 2788};
2789 2789
2790void f2fs_clear_radix_tree_dirty_tag(struct page *page) 2790void f2fs_clear_page_cache_dirty_tag(struct page *page)
2791{ 2791{
2792 struct address_space *mapping = page_mapping(page); 2792 struct address_space *mapping = page_mapping(page);
2793 unsigned long flags; 2793 unsigned long flags;
2794 2794
2795 xa_lock_irqsave(&mapping->i_pages, flags); 2795 xa_lock_irqsave(&mapping->i_pages, flags);
2796 radix_tree_tag_clear(&mapping->i_pages, page_index(page), 2796 __xa_clear_mark(&mapping->i_pages, page_index(page),
2797 PAGECACHE_TAG_DIRTY); 2797 PAGECACHE_TAG_DIRTY);
2798 xa_unlock_irqrestore(&mapping->i_pages, flags); 2798 xa_unlock_irqrestore(&mapping->i_pages, flags);
2799} 2799}
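__xa_clear_mark() is the caller-locked flavour of xa_clear_mark(): the double-underscore helpers assume the xa_lock is already held, which is why f2fs keeps its xa_lock_irqsave() around the call. A minimal sketch of the same pattern on a generic mapping (the helper name is hypothetical):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

static void drop_dirty_mark(struct address_space *mapping, pgoff_t index)
{
        unsigned long flags;

        xa_lock_irqsave(&mapping->i_pages, flags);
        __xa_clear_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
        xa_unlock_irqrestore(&mapping->i_pages, flags);

        /* xa_get_mark() needs no caller locking; barring a concurrent
         * redirty, the mark is now clear. */
        WARN_ON(xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY));
}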
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2ef84b4590ea..bacc667950b6 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -726,7 +726,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
726 726
727 if (bit_pos == NR_DENTRY_IN_BLOCK && 727 if (bit_pos == NR_DENTRY_IN_BLOCK &&
728 !f2fs_truncate_hole(dir, page->index, page->index + 1)) { 728 !f2fs_truncate_hole(dir, page->index, page->index + 1)) {
729 f2fs_clear_radix_tree_dirty_tag(page); 729 f2fs_clear_page_cache_dirty_tag(page);
730 clear_page_dirty_for_io(page); 730 clear_page_dirty_for_io(page);
731 ClearPagePrivate(page); 731 ClearPagePrivate(page);
732 ClearPageUptodate(page); 732 ClearPageUptodate(page);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 56204a8f8a12..1e031971a466 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3108,7 +3108,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
3108 struct page *page, enum migrate_mode mode); 3108 struct page *page, enum migrate_mode mode);
3109#endif 3109#endif
3110bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); 3110bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
3111void f2fs_clear_radix_tree_dirty_tag(struct page *page); 3111void f2fs_clear_page_cache_dirty_tag(struct page *page);
3112 3112
3113/* 3113/*
3114 * gc.c 3114 * gc.c
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index cb31a719b048..7b0cff7e6051 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -243,7 +243,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
243 kunmap_atomic(src_addr); 243 kunmap_atomic(src_addr);
244 set_page_dirty(dn.inode_page); 244 set_page_dirty(dn.inode_page);
245 245
246 f2fs_clear_radix_tree_dirty_tag(page); 246 f2fs_clear_page_cache_dirty_tag(page);
247 247
248 set_inode_flag(inode, FI_APPEND_WRITE); 248 set_inode_flag(inode, FI_APPEND_WRITE);
249 set_inode_flag(inode, FI_DATA_EXIST); 249 set_inode_flag(inode, FI_DATA_EXIST);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 2b34206486d8..d338740d0fda 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -101,7 +101,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
101static void clear_node_page_dirty(struct page *page) 101static void clear_node_page_dirty(struct page *page)
102{ 102{
103 if (PageDirty(page)) { 103 if (PageDirty(page)) {
104 f2fs_clear_radix_tree_dirty_tag(page); 104 f2fs_clear_page_cache_dirty_tag(page);
105 clear_page_dirty_for_io(page); 105 clear_page_dirty_for_io(page);
106 dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); 106 dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
107 } 107 }
@@ -1306,9 +1306,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1306 if (f2fs_check_nid_range(sbi, nid)) 1306 if (f2fs_check_nid_range(sbi, nid))
1307 return; 1307 return;
1308 1308
1309 rcu_read_lock(); 1309 apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
1310 apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
1311 rcu_read_unlock();
1312 if (apage) 1310 if (apage)
1313 return; 1311 return;
1314 1312
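xa_load() takes the RCU read lock internally, which is why the explicit rcu_read_lock()/radix_tree_lookup()/rcu_read_unlock() sequence above collapses to a single call. A minimal sketch (hypothetical helper name):

#include <linux/xarray.h>

/* True if @index currently holds an entry; no caller-side locking required. */
static bool entry_present(struct xarray *xa, unsigned long index)
{
        return xa_load(xa, index) != NULL;
}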
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..b40168fcc94a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -339,9 +339,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
339 struct address_space *mapping = inode->i_mapping; 339 struct address_space *mapping = inode->i_mapping;
340 struct bdi_writeback *old_wb = inode->i_wb; 340 struct bdi_writeback *old_wb = inode->i_wb;
341 struct bdi_writeback *new_wb = isw->new_wb; 341 struct bdi_writeback *new_wb = isw->new_wb;
342 struct radix_tree_iter iter; 342 XA_STATE(xas, &mapping->i_pages, 0);
343 struct page *page;
343 bool switched = false; 344 bool switched = false;
344 void **slot;
345 345
346 /* 346 /*
347 * By the time control reaches here, RCU grace period has passed 347 * By the time control reaches here, RCU grace period has passed
@@ -375,25 +375,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
375 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to 375 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
376 * pages actually under writeback. 376 * pages actually under writeback.
377 */ 377 */
378 radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, 378 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
379 PAGECACHE_TAG_DIRTY) { 379 if (PageDirty(page)) {
380 struct page *page = radix_tree_deref_slot_protected(slot,
381 &mapping->i_pages.xa_lock);
382 if (likely(page) && PageDirty(page)) {
383 dec_wb_stat(old_wb, WB_RECLAIMABLE); 380 dec_wb_stat(old_wb, WB_RECLAIMABLE);
384 inc_wb_stat(new_wb, WB_RECLAIMABLE); 381 inc_wb_stat(new_wb, WB_RECLAIMABLE);
385 } 382 }
386 } 383 }
387 384
388 radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, 385 xas_set(&xas, 0);
389 PAGECACHE_TAG_WRITEBACK) { 386 xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
390 struct page *page = radix_tree_deref_slot_protected(slot, 387 WARN_ON_ONCE(!PageWriteback(page));
391 &mapping->i_pages.xa_lock); 388 dec_wb_stat(old_wb, WB_WRITEBACK);
392 if (likely(page)) { 389 inc_wb_stat(new_wb, WB_WRITEBACK);
393 WARN_ON_ONCE(!PageWriteback(page));
394 dec_wb_stat(old_wb, WB_WRITEBACK);
395 inc_wb_stat(new_wb, WB_WRITEBACK);
396 }
397 } 390 }
398 391
399 wb_get(new_wb); 392 wb_get(new_wb);
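xas_for_each_marked() only ever returns live entries, so the radix-tree-era checks for NULL slots and internal entries disappear from the writeback-switch loops above. A minimal sketch of the same iteration style, counting marked entries under the i_pages lock (the helper is hypothetical):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

static unsigned long count_dirty_entries(struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, 0);
        unsigned long nr = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, entry, ULONG_MAX, PAGECACHE_TAG_DIRTY)
                nr++;
        xas_unlock_irq(&xas);

        return nr;
}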
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 31e8270d0b26..8afbb35559b9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
366 pgoff_t done_index; 366 pgoff_t done_index;
367 int cycled; 367 int cycled;
368 int range_whole = 0; 368 int range_whole = 0;
369 int tag; 369 xa_mark_t tag;
370 370
371 pagevec_init(&pvec); 371 pagevec_init(&pvec);
372 if (wbc->range_cyclic) { 372 if (wbc->range_cyclic) {
diff --git a/fs/inode.c b/fs/inode.c
index 42f6d25f32a5..9b808986d440 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink);
349 349
350static void __address_space_init_once(struct address_space *mapping) 350static void __address_space_init_once(struct address_space *mapping)
351{ 351{
352 INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); 352 xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ);
353 init_rwsem(&mapping->i_mmap_rwsem); 353 init_rwsem(&mapping->i_mmap_rwsem);
354 INIT_LIST_HEAD(&mapping->private_list); 354 INIT_LIST_HEAD(&mapping->private_list);
355 spin_lock_init(&mapping->private_lock); 355 spin_lock_init(&mapping->private_lock);
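xa_init_flags() replaces INIT_RADIX_TREE(): locking behaviour (XA_FLAGS_LOCK_IRQ here) lives in the flags, while GFP flags move to the individual store operations. A sketch of the run-time and compile-time initialisation forms, with the names static_cache and my_ctx invented for the example:

#include <linux/xarray.h>

static DEFINE_XARRAY_FLAGS(static_cache, XA_FLAGS_LOCK_IRQ);   /* compile time */

struct my_ctx {
        struct xarray cache;
};

static void my_ctx_init(struct my_ctx *ctx)
{
        xa_init_flags(&ctx->cache, XA_FLAGS_LOCK_IRQ);          /* run time */
}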
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 947ce22f5b3c..f0fe641893a5 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod
46 return i; 46 return i;
47} 47}
48 48
49/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */ 49/* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */
50int get_acorn_filename(struct iso_directory_record *de, 50int get_acorn_filename(struct iso_directory_record *de,
51 char *retname, struct inode *inode) 51 char *retname, struct inode *inode)
52{ 52{
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 06cb0c1d9aee..d3781cd983f6 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
896 end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 896 end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
897 if (end != inode->i_mapping->nrpages) { 897 if (end != inode->i_mapping->nrpages) {
898 rcu_read_lock(); 898 rcu_read_lock();
899 end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); 899 end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
900 rcu_read_unlock(); 900 rcu_read_unlock();
901 } 901 }
902 902
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index ebb24a314f43..de99db518571 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
168 ctxt->newbh = NULL; 168 ctxt->newbh = NULL;
169 169
170 if (inode->i_blkbits == PAGE_SHIFT) { 170 if (inode->i_blkbits == PAGE_SHIFT) {
171 lock_page(obh->b_page); 171 struct page *opage = obh->b_page;
172 /* 172 lock_page(opage);
173 * We cannot call radix_tree_preload for the kernels older
174 * than 2.6.23, because it is not exported for modules.
175 */
176retry: 173retry:
177 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
178 if (err)
179 goto failed_unlock;
180 /* BUG_ON(oldkey != obh->b_page->index); */ 174 /* BUG_ON(oldkey != obh->b_page->index); */
181 if (unlikely(oldkey != obh->b_page->index)) 175 if (unlikely(oldkey != opage->index))
182 NILFS_PAGE_BUG(obh->b_page, 176 NILFS_PAGE_BUG(opage,
183 "invalid oldkey %lld (newkey=%lld)", 177 "invalid oldkey %lld (newkey=%lld)",
184 (unsigned long long)oldkey, 178 (unsigned long long)oldkey,
185 (unsigned long long)newkey); 179 (unsigned long long)newkey);
186 180
187 xa_lock_irq(&btnc->i_pages); 181 xa_lock_irq(&btnc->i_pages);
188 err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); 182 err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS);
189 xa_unlock_irq(&btnc->i_pages); 183 xa_unlock_irq(&btnc->i_pages);
190 /* 184 /*
191 * Note: page->index will not change to newkey until 185 * Note: page->index will not change to newkey until
@@ -193,7 +187,6 @@ retry:
193 * To protect the page in intermediate state, the page lock 187 * To protect the page in intermediate state, the page lock
194 * is held. 188 * is held.
195 */ 189 */
196 radix_tree_preload_end();
197 if (!err) 190 if (!err)
198 return 0; 191 return 0;
199 else if (err != -EEXIST) 192 else if (err != -EEXIST)
@@ -203,7 +196,7 @@ retry:
203 if (!err) 196 if (!err)
204 goto retry; 197 goto retry;
205 /* fallback to copy mode */ 198 /* fallback to copy mode */
206 unlock_page(obh->b_page); 199 unlock_page(opage);
207 } 200 }
208 201
209 nbh = nilfs_btnode_create_block(btnc, newkey); 202 nbh = nilfs_btnode_create_block(btnc, newkey);
@@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
243 mark_buffer_dirty(obh); 236 mark_buffer_dirty(obh);
244 237
245 xa_lock_irq(&btnc->i_pages); 238 xa_lock_irq(&btnc->i_pages);
246 radix_tree_delete(&btnc->i_pages, oldkey); 239 __xa_erase(&btnc->i_pages, oldkey);
247 radix_tree_tag_set(&btnc->i_pages, newkey, 240 __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY);
248 PAGECACHE_TAG_DIRTY);
249 xa_unlock_irq(&btnc->i_pages); 241 xa_unlock_irq(&btnc->i_pages);
250 242
251 opage->index = obh->b_blocknr = newkey; 243 opage->index = obh->b_blocknr = newkey;
@@ -275,7 +267,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
275 267
276 if (nbh == NULL) { /* blocksize == pagesize */ 268 if (nbh == NULL) { /* blocksize == pagesize */
277 xa_lock_irq(&btnc->i_pages); 269 xa_lock_irq(&btnc->i_pages);
278 radix_tree_delete(&btnc->i_pages, newkey); 270 __xa_erase(&btnc->i_pages, newkey);
279 xa_unlock_irq(&btnc->i_pages); 271 xa_unlock_irq(&btnc->i_pages);
280 unlock_page(ctxt->bh->b_page); 272 unlock_page(ctxt->bh->b_page);
281 } else 273 } else
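Because __xa_insert() takes GFP flags at the call site and allocates any needed nodes itself, the radix_tree_preload()/radix_tree_preload_end() pair in the old nilfs code becomes unnecessary; -EEXIST still signals an occupied index, as the retry logic above shows. A minimal sketch of the same insert-if-empty pattern (hypothetical helper):

#include <linux/xarray.h>

static int cache_insert(struct xarray *xa, unsigned long index, void *item)
{
        int err;

        xa_lock_irq(xa);
        err = __xa_insert(xa, index, item, GFP_NOFS);
        xa_unlock_irq(xa);

        return err;     /* 0 on success, -EEXIST if occupied, -ENOMEM on failure */
}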
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 329a056b73b1..d7fc8d369d89 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -289,7 +289,7 @@ repeat:
289 * @dmap: destination page cache 289 * @dmap: destination page cache
290 * @smap: source page cache 290 * @smap: source page cache
291 * 291 *
292 * No pages must no be added to the cache during this process. 292 * No pages must be added to the cache during this process.
293 * This must be ensured by the caller. 293 * This must be ensured by the caller.
294 */ 294 */
295void nilfs_copy_back_pages(struct address_space *dmap, 295void nilfs_copy_back_pages(struct address_space *dmap,
@@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap,
298 struct pagevec pvec; 298 struct pagevec pvec;
299 unsigned int i, n; 299 unsigned int i, n;
300 pgoff_t index = 0; 300 pgoff_t index = 0;
301 int err;
302 301
303 pagevec_init(&pvec); 302 pagevec_init(&pvec);
304repeat: 303repeat:
@@ -313,35 +312,34 @@ repeat:
313 lock_page(page); 312 lock_page(page);
314 dpage = find_lock_page(dmap, offset); 313 dpage = find_lock_page(dmap, offset);
315 if (dpage) { 314 if (dpage) {
316 /* override existing page on the destination cache */ 315 /* overwrite existing page in the destination cache */
317 WARN_ON(PageDirty(dpage)); 316 WARN_ON(PageDirty(dpage));
318 nilfs_copy_page(dpage, page, 0); 317 nilfs_copy_page(dpage, page, 0);
319 unlock_page(dpage); 318 unlock_page(dpage);
320 put_page(dpage); 319 put_page(dpage);
320 /* Do we not need to remove page from smap here? */
321 } else { 321 } else {
322 struct page *page2; 322 struct page *p;
323 323
324 /* move the page to the destination cache */ 324 /* move the page to the destination cache */
325 xa_lock_irq(&smap->i_pages); 325 xa_lock_irq(&smap->i_pages);
326 page2 = radix_tree_delete(&smap->i_pages, offset); 326 p = __xa_erase(&smap->i_pages, offset);
327 WARN_ON(page2 != page); 327 WARN_ON(page != p);
328
329 smap->nrpages--; 328 smap->nrpages--;
330 xa_unlock_irq(&smap->i_pages); 329 xa_unlock_irq(&smap->i_pages);
331 330
332 xa_lock_irq(&dmap->i_pages); 331 xa_lock_irq(&dmap->i_pages);
333 err = radix_tree_insert(&dmap->i_pages, offset, page); 332 p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS);
334 if (unlikely(err < 0)) { 333 if (unlikely(p)) {
335 WARN_ON(err == -EEXIST); 334 /* Probably -ENOMEM */
336 page->mapping = NULL; 335 page->mapping = NULL;
337 put_page(page); /* for cache */ 336 put_page(page);
338 } else { 337 } else {
339 page->mapping = dmap; 338 page->mapping = dmap;
340 dmap->nrpages++; 339 dmap->nrpages++;
341 if (PageDirty(page)) 340 if (PageDirty(page))
342 radix_tree_tag_set(&dmap->i_pages, 341 __xa_set_mark(&dmap->i_pages, offset,
343 offset, 342 PAGECACHE_TAG_DIRTY);
344 PAGECACHE_TAG_DIRTY);
345 } 343 }
346 xa_unlock_irq(&dmap->i_pages); 344 xa_unlock_irq(&dmap->i_pages);
347 } 345 }
@@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page)
467 if (mapping) { 465 if (mapping) {
468 xa_lock_irq(&mapping->i_pages); 466 xa_lock_irq(&mapping->i_pages);
469 if (test_bit(PG_dirty, &page->flags)) { 467 if (test_bit(PG_dirty, &page->flags)) {
470 radix_tree_tag_clear(&mapping->i_pages, 468 __xa_clear_mark(&mapping->i_pages, page_index(page),
471 page_index(page),
472 PAGECACHE_TAG_DIRTY); 469 PAGECACHE_TAG_DIRTY);
473 xa_unlock_irq(&mapping->i_pages); 470 xa_unlock_irq(&mapping->i_pages);
474 return clear_page_dirty_for_io(page); 471 return clear_page_dirty_for_io(page);
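__xa_store() returns the previous occupant of the slot on success and an error encoded as a pointer on failure, decoded with xa_is_err()/xa_err(); that is why the nilfs code above only has to inspect the returned pointer. A minimal sketch (hypothetical helper):

#include <linux/xarray.h>

static int cache_replace(struct xarray *xa, unsigned long index, void *item)
{
        void *old;

        xa_lock_irq(xa);
        old = __xa_store(xa, index, item, GFP_NOFS);
        xa_unlock_irq(xa);

        if (xa_is_err(old))
                return xa_err(old);     /* most likely -ENOMEM */
        /* @old is whatever the slot held before (NULL if it was empty). */
        return 0;
}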
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a027473561c6..47c3764c469b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
521 if (!page) 521 if (!page)
522 return; 522 return;
523 523
524 if (radix_tree_exceptional_entry(page)) 524 if (xa_is_value(page))
525 mss->swap += PAGE_SIZE; 525 mss->swap += PAGE_SIZE;
526 else 526 else
527 put_page(page); 527 put_page(page);
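xa_is_value() is the direct replacement for radix_tree_exceptional_entry(): a value entry keeps a small integer (here a swap entry) in the word itself instead of pointing at a struct page. A round-trip sketch with invented names:

#include <linux/printk.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(demo_array);

static void value_entry_demo(void)
{
        void *entry;

        xa_store(&demo_array, 1, xa_mk_value(42), GFP_KERNEL);

        entry = xa_load(&demo_array, 1);
        if (xa_is_value(entry))
                pr_info("index 1 holds value %lu\n", xa_to_value(entry));

        xa_erase(&demo_array, 1);
}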
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 897eae8faee1..771341470bce 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -403,24 +403,40 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
403 loff_t pos, unsigned len, unsigned copied, 403 loff_t pos, unsigned len, unsigned copied,
404 struct page *page, void *fsdata); 404 struct page *page, void *fsdata);
405 405
406/**
407 * struct address_space - Contents of a cacheable, mappable object.
408 * @host: Owner, either the inode or the block_device.
409 * @i_pages: Cached pages.
410 * @gfp_mask: Memory allocation flags to use for allocating pages.
411 * @i_mmap_writable: Number of VM_SHARED mappings.
412 * @i_mmap: Tree of private and shared mappings.
413 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
414 * @nrpages: Number of page entries, protected by the i_pages lock.
415 * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
416 * @writeback_index: Writeback starts here.
417 * @a_ops: Methods.
418 * @flags: Error bits and flags (AS_*).
419 * @wb_err: The most recent error which has occurred.
420 * @private_lock: For use by the owner of the address_space.
421 * @private_list: For use by the owner of the address_space.
422 * @private_data: For use by the owner of the address_space.
423 */
406struct address_space { 424struct address_space {
407 struct inode *host; /* owner: inode, block_device */ 425 struct inode *host;
408 struct radix_tree_root i_pages; /* cached pages */ 426 struct xarray i_pages;
409 atomic_t i_mmap_writable;/* count VM_SHARED mappings */ 427 gfp_t gfp_mask;
410 struct rb_root_cached i_mmap; /* tree of private and shared mappings */ 428 atomic_t i_mmap_writable;
411 struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ 429 struct rb_root_cached i_mmap;
412 /* Protected by the i_pages lock */ 430 struct rw_semaphore i_mmap_rwsem;
413 unsigned long nrpages; /* number of total pages */ 431 unsigned long nrpages;
414 /* number of shadow or DAX exceptional entries */
415 unsigned long nrexceptional; 432 unsigned long nrexceptional;
416 pgoff_t writeback_index;/* writeback starts here */ 433 pgoff_t writeback_index;
417 const struct address_space_operations *a_ops; /* methods */ 434 const struct address_space_operations *a_ops;
418 unsigned long flags; /* error bits */ 435 unsigned long flags;
419 spinlock_t private_lock; /* for use by the address_space */
420 gfp_t gfp_mask; /* implicit gfp mask for allocations */
421 struct list_head private_list; /* for use by the address_space */
422 void *private_data; /* ditto */
423 errseq_t wb_err; 436 errseq_t wb_err;
437 spinlock_t private_lock;
438 struct list_head private_list;
439 void *private_data;
424} __attribute__((aligned(sizeof(long)))) __randomize_layout; 440} __attribute__((aligned(sizeof(long)))) __randomize_layout;
425 /* 441 /*
426 * On most architectures that alignment is already the case; but 442 * On most architectures that alignment is already the case; but
@@ -467,15 +483,18 @@ struct block_device {
467 struct mutex bd_fsfreeze_mutex; 483 struct mutex bd_fsfreeze_mutex;
468} __randomize_layout; 484} __randomize_layout;
469 485
486/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
487#define PAGECACHE_TAG_DIRTY XA_MARK_0
488#define PAGECACHE_TAG_WRITEBACK XA_MARK_1
489#define PAGECACHE_TAG_TOWRITE XA_MARK_2
490
470/* 491/*
471 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache 492 * Returns true if any of the pages in the mapping are marked with the tag.
472 * radix trees
473 */ 493 */
474#define PAGECACHE_TAG_DIRTY 0 494static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
475#define PAGECACHE_TAG_WRITEBACK 1 495{
476#define PAGECACHE_TAG_TOWRITE 2 496 return xa_marked(&mapping->i_pages, tag);
477 497}
478int mapping_tagged(struct address_space *mapping, int tag);
479 498
480static inline void i_mmap_lock_write(struct address_space *mapping) 499static inline void i_mmap_lock_write(struct address_space *mapping)
481{ 500{
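With the page cache tags redefined as xa_mark_t values, mapping_tagged() becomes an inline wrapper around xa_marked() and costs only a read of xa_flags. A small sketch of the kind of check callers make (the helper name is invented):

#include <linux/fs.h>
#include <linux/pagemap.h>

static bool mapping_needs_writeback_attention(struct address_space *mapping)
{
        return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
               mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}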
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 3ec8628ce17f..60daf34b625d 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -214,8 +214,7 @@ static inline void idr_preload_end(void)
214 ++id, (entry) = idr_get_next((idr), &(id))) 214 ++id, (entry) = idr_get_next((idr), &(id)))
215 215
216/* 216/*
217 * IDA - IDR based id allocator, use when translation from id to 217 * IDA - ID Allocator, use when translation from id to pointer isn't necessary.
218 * pointer isn't necessary.
219 */ 218 */
220#define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ 219#define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */
221#define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) 220#define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long))
@@ -225,14 +224,14 @@ struct ida_bitmap {
225 unsigned long bitmap[IDA_BITMAP_LONGS]; 224 unsigned long bitmap[IDA_BITMAP_LONGS];
226}; 225};
227 226
228DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap);
229
230struct ida { 227struct ida {
231 struct radix_tree_root ida_rt; 228 struct xarray xa;
232}; 229};
233 230
231#define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC)
232
234#define IDA_INIT(name) { \ 233#define IDA_INIT(name) { \
235 .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \ 234 .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \
236} 235}
237#define DEFINE_IDA(name) struct ida name = IDA_INIT(name) 236#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
238 237
@@ -292,7 +291,7 @@ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp)
292 291
293static inline void ida_init(struct ida *ida) 292static inline void ida_init(struct ida *ida)
294{ 293{
295 INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT); 294 xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
296} 295}
297 296
298#define ida_simple_get(ida, start, end, gfp) \ 297#define ida_simple_get(ida, start, end, gfp) \
@@ -301,9 +300,6 @@ static inline void ida_init(struct ida *ida)
301 300
302static inline bool ida_is_empty(const struct ida *ida) 301static inline bool ida_is_empty(const struct ida *ida)
303{ 302{
304 return radix_tree_empty(&ida->ida_rt); 303 return xa_empty(&ida->xa);
305} 304}
306
307/* in lib/radix-tree.c */
308int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
309#endif /* __IDR_H__ */ 305#endif /* __IDR_H__ */
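The IDA keeps its public API; only the backing store changes to an XArray created with XA_FLAGS_ALLOC, which is why ida_pre_get() and the per-CPU bitmap preload can go. A minimal usage sketch with an invented IDA name:

#include <linux/gfp.h>
#include <linux/idr.h>

static DEFINE_IDA(session_ida);

static int new_session_id(void)
{
        return ida_alloc(&session_ida, GFP_KERNEL);     /* id >= 0, or -errno */
}

static void free_session_id(int id)
{
        ida_free(&session_ida, id);
}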
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b1bd2186e6d2..226f96f0dee0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -241,9 +241,9 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x)
241 241
242typedef int filler_t(void *, struct page *); 242typedef int filler_t(void *, struct page *);
243 243
244pgoff_t page_cache_next_hole(struct address_space *mapping, 244pgoff_t page_cache_next_miss(struct address_space *mapping,
245 pgoff_t index, unsigned long max_scan); 245 pgoff_t index, unsigned long max_scan);
246pgoff_t page_cache_prev_hole(struct address_space *mapping, 246pgoff_t page_cache_prev_miss(struct address_space *mapping,
247 pgoff_t index, unsigned long max_scan); 247 pgoff_t index, unsigned long max_scan);
248 248
249#define FGP_ACCESSED 0x00000001 249#define FGP_ACCESSED 0x00000001
@@ -363,17 +363,17 @@ static inline unsigned find_get_pages(struct address_space *mapping,
363unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, 363unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
364 unsigned int nr_pages, struct page **pages); 364 unsigned int nr_pages, struct page **pages);
365unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, 365unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
366 pgoff_t end, int tag, unsigned int nr_pages, 366 pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
367 struct page **pages); 367 struct page **pages);
368static inline unsigned find_get_pages_tag(struct address_space *mapping, 368static inline unsigned find_get_pages_tag(struct address_space *mapping,
369 pgoff_t *index, int tag, unsigned int nr_pages, 369 pgoff_t *index, xa_mark_t tag, unsigned int nr_pages,
370 struct page **pages) 370 struct page **pages)
371{ 371{
372 return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, 372 return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
373 nr_pages, pages); 373 nr_pages, pages);
374} 374}
375unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 375unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
376 int tag, unsigned int nr_entries, 376 xa_mark_t tag, unsigned int nr_entries,
377 struct page **entries, pgoff_t *indices); 377 struct page **entries, pgoff_t *indices);
378 378
379struct page *grab_cache_page_write_begin(struct address_space *mapping, 379struct page *grab_cache_page_write_begin(struct address_space *mapping,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 6dc456ac6136..081d934eda64 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -9,6 +9,8 @@
9#ifndef _LINUX_PAGEVEC_H 9#ifndef _LINUX_PAGEVEC_H
10#define _LINUX_PAGEVEC_H 10#define _LINUX_PAGEVEC_H
11 11
12#include <linux/xarray.h>
13
12/* 15 pointers + header align the pagevec structure to a power of two */ 14/* 15 pointers + header align the pagevec structure to a power of two */
13#define PAGEVEC_SIZE 15 15#define PAGEVEC_SIZE 15
14 16
@@ -40,12 +42,12 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec,
40 42
41unsigned pagevec_lookup_range_tag(struct pagevec *pvec, 43unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
42 struct address_space *mapping, pgoff_t *index, pgoff_t end, 44 struct address_space *mapping, pgoff_t *index, pgoff_t end,
43 int tag); 45 xa_mark_t tag);
44unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, 46unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
45 struct address_space *mapping, pgoff_t *index, pgoff_t end, 47 struct address_space *mapping, pgoff_t *index, pgoff_t end,
46 int tag, unsigned max_pages); 48 xa_mark_t tag, unsigned max_pages);
47static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, 49static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
48 struct address_space *mapping, pgoff_t *index, int tag) 50 struct address_space *mapping, pgoff_t *index, xa_mark_t tag)
49{ 51{
50 return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); 52 return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
51} 53}
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 34149e8b5f73..06c4c7a6c09c 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -28,34 +28,30 @@
28#include <linux/rcupdate.h> 28#include <linux/rcupdate.h>
29#include <linux/spinlock.h> 29#include <linux/spinlock.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/xarray.h>
32
33/* Keep unconverted code working */
34#define radix_tree_root xarray
35#define radix_tree_node xa_node
31 36
32/* 37/*
33 * The bottom two bits of the slot determine how the remaining bits in the 38 * The bottom two bits of the slot determine how the remaining bits in the
34 * slot are interpreted: 39 * slot are interpreted:
35 * 40 *
36 * 00 - data pointer 41 * 00 - data pointer
37 * 01 - internal entry 42 * 10 - internal entry
38 * 10 - exceptional entry 43 * x1 - value entry
39 * 11 - this bit combination is currently unused/reserved
40 * 44 *
41 * The internal entry may be a pointer to the next level in the tree, a 45 * The internal entry may be a pointer to the next level in the tree, a
42 * sibling entry, or an indicator that the entry in this slot has been moved 46 * sibling entry, or an indicator that the entry in this slot has been moved
43 * to another location in the tree and the lookup should be restarted. While 47 * to another location in the tree and the lookup should be restarted. While
44 * NULL fits the 'data pointer' pattern, it means that there is no entry in 48 * NULL fits the 'data pointer' pattern, it means that there is no entry in
45 * the tree for this index (no matter what level of the tree it is found at). 49 * the tree for this index (no matter what level of the tree it is found at).
46 * This means that you cannot store NULL in the tree as a value for the index. 50 * This means that storing a NULL entry in the tree is the same as deleting
51 * the entry from the tree.
47 */ 52 */
48#define RADIX_TREE_ENTRY_MASK 3UL 53#define RADIX_TREE_ENTRY_MASK 3UL
49#define RADIX_TREE_INTERNAL_NODE 1UL 54#define RADIX_TREE_INTERNAL_NODE 2UL
50
51/*
52 * Most users of the radix tree store pointers but shmem/tmpfs stores swap
53 * entries in the same tree. They are marked as exceptional entries to
54 * distinguish them from pointers to struct page.
55 * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
56 */
57#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
58#define RADIX_TREE_EXCEPTIONAL_SHIFT 2
59 55
60static inline bool radix_tree_is_internal_node(void *ptr) 56static inline bool radix_tree_is_internal_node(void *ptr)
61{ 57{
@@ -65,75 +61,32 @@ static inline bool radix_tree_is_internal_node(void *ptr)
65 61
66/*** radix-tree API starts here ***/ 62/*** radix-tree API starts here ***/
67 63
68#define RADIX_TREE_MAX_TAGS 3 64#define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT
69
70#ifndef RADIX_TREE_MAP_SHIFT
71#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
72#endif
73
74#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) 65#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
75#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) 66#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
76 67
77#define RADIX_TREE_TAG_LONGS \ 68#define RADIX_TREE_MAX_TAGS XA_MAX_MARKS
78 ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) 69#define RADIX_TREE_TAG_LONGS XA_MARK_LONGS
79 70
80#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) 71#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
81#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ 72#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
82 RADIX_TREE_MAP_SHIFT)) 73 RADIX_TREE_MAP_SHIFT))
83 74
84/* 75/* The IDR tag is stored in the low bits of xa_flags */
85 * @count is the count of every non-NULL element in the ->slots array
86 * whether that is an exceptional entry, a retry entry, a user pointer,
87 * a sibling entry or a pointer to the next level of the tree.
88 * @exceptional is the count of every element in ->slots which is
89 * either radix_tree_exceptional_entry() or is a sibling entry for an
90 * exceptional entry.
91 */
92struct radix_tree_node {
93 unsigned char shift; /* Bits remaining in each slot */
94 unsigned char offset; /* Slot offset in parent */
95 unsigned char count; /* Total entry count */
96 unsigned char exceptional; /* Exceptional entry count */
97 struct radix_tree_node *parent; /* Used when ascending tree */
98 struct radix_tree_root *root; /* The tree we belong to */
99 union {
100 struct list_head private_list; /* For tree user */
101 struct rcu_head rcu_head; /* Used when freeing node */
102 };
103 void __rcu *slots[RADIX_TREE_MAP_SIZE];
104 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
105};
106
107/* The IDR tag is stored in the low bits of the GFP flags */
108#define ROOT_IS_IDR ((__force gfp_t)4) 76#define ROOT_IS_IDR ((__force gfp_t)4)
109/* The top bits of gfp_mask are used to store the root tags */ 77/* The top bits of xa_flags are used to store the root tags */
110#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) 78#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT)
111 79
112struct radix_tree_root { 80#define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask)
113 spinlock_t xa_lock;
114 gfp_t gfp_mask;
115 struct radix_tree_node __rcu *rnode;
116};
117
118#define RADIX_TREE_INIT(name, mask) { \
119 .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \
120 .gfp_mask = (mask), \
121 .rnode = NULL, \
122}
123 81
124#define RADIX_TREE(name, mask) \ 82#define RADIX_TREE(name, mask) \
125 struct radix_tree_root name = RADIX_TREE_INIT(name, mask) 83 struct radix_tree_root name = RADIX_TREE_INIT(name, mask)
126 84
127#define INIT_RADIX_TREE(root, mask) \ 85#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)
128do { \
129 spin_lock_init(&(root)->xa_lock); \
130 (root)->gfp_mask = (mask); \
131 (root)->rnode = NULL; \
132} while (0)
133 86
134static inline bool radix_tree_empty(const struct radix_tree_root *root) 87static inline bool radix_tree_empty(const struct radix_tree_root *root)
135{ 88{
136 return root->rnode == NULL; 89 return root->xa_head == NULL;
137} 90}
138 91
139/** 92/**
@@ -143,7 +96,6 @@ static inline bool radix_tree_empty(const struct radix_tree_root *root)
143 * @next_index: one beyond the last index for this chunk 96 * @next_index: one beyond the last index for this chunk
144 * @tags: bit-mask for tag-iterating 97 * @tags: bit-mask for tag-iterating
145 * @node: node that contains current slot 98 * @node: node that contains current slot
146 * @shift: shift for the node that holds our slots
147 * 99 *
148 * This radix tree iterator works in terms of "chunks" of slots. A chunk is a 100 * This radix tree iterator works in terms of "chunks" of slots. A chunk is a
149 * subinterval of slots contained within one radix tree leaf node. It is 101 * subinterval of slots contained within one radix tree leaf node. It is
@@ -157,20 +109,8 @@ struct radix_tree_iter {
157 unsigned long next_index; 109 unsigned long next_index;
158 unsigned long tags; 110 unsigned long tags;
159 struct radix_tree_node *node; 111 struct radix_tree_node *node;
160#ifdef CONFIG_RADIX_TREE_MULTIORDER
161 unsigned int shift;
162#endif
163}; 112};
164 113
165static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
166{
167#ifdef CONFIG_RADIX_TREE_MULTIORDER
168 return iter->shift;
169#else
170 return 0;
171#endif
172}
173
174/** 114/**
175 * Radix-tree synchronization 115 * Radix-tree synchronization
176 * 116 *
@@ -194,12 +134,11 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
194 * radix_tree_lookup_slot 134 * radix_tree_lookup_slot
195 * radix_tree_tag_get 135 * radix_tree_tag_get
196 * radix_tree_gang_lookup 136 * radix_tree_gang_lookup
197 * radix_tree_gang_lookup_slot
198 * radix_tree_gang_lookup_tag 137 * radix_tree_gang_lookup_tag
199 * radix_tree_gang_lookup_tag_slot 138 * radix_tree_gang_lookup_tag_slot
200 * radix_tree_tagged 139 * radix_tree_tagged
201 * 140 *
202 * The first 8 functions are able to be called locklessly, using RCU. The 141 * The first 7 functions are able to be called locklessly, using RCU. The
203 * caller must ensure calls to these functions are made within rcu_read_lock() 142 * caller must ensure calls to these functions are made within rcu_read_lock()
204 * regions. Other readers (lock-free or otherwise) and modifications may be 143 * regions. Other readers (lock-free or otherwise) and modifications may be
205 * running concurrently. 144 * running concurrently.
@@ -269,17 +208,6 @@ static inline int radix_tree_deref_retry(void *arg)
269} 208}
270 209
271/** 210/**
272 * radix_tree_exceptional_entry - radix_tree_deref_slot gave exceptional entry?
273 * @arg: value returned by radix_tree_deref_slot
274 * Returns: 0 if well-aligned pointer, non-0 if exceptional entry.
275 */
276static inline int radix_tree_exceptional_entry(void *arg)
277{
278 /* Not unlikely because radix_tree_exception often tested first */
279 return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
280}
281
282/**
283 * radix_tree_exception - radix_tree_deref_slot returned either exception? 211 * radix_tree_exception - radix_tree_deref_slot returned either exception?
284 * @arg: value returned by radix_tree_deref_slot 212 * @arg: value returned by radix_tree_deref_slot
285 * Returns: 0 if well-aligned pointer, non-0 if either kind of exception. 213 * Returns: 0 if well-aligned pointer, non-0 if either kind of exception.
@@ -289,47 +217,28 @@ static inline int radix_tree_exception(void *arg)
289 return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); 217 return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
290} 218}
291 219
292int __radix_tree_create(struct radix_tree_root *, unsigned long index, 220int radix_tree_insert(struct radix_tree_root *, unsigned long index,
293 unsigned order, struct radix_tree_node **nodep, 221 void *);
294 void __rcu ***slotp);
295int __radix_tree_insert(struct radix_tree_root *, unsigned long index,
296 unsigned order, void *);
297static inline int radix_tree_insert(struct radix_tree_root *root,
298 unsigned long index, void *entry)
299{
300 return __radix_tree_insert(root, index, 0, entry);
301}
302void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, 222void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
303 struct radix_tree_node **nodep, void __rcu ***slotp); 223 struct radix_tree_node **nodep, void __rcu ***slotp);
304void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); 224void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
305void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, 225void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
306 unsigned long index); 226 unsigned long index);
307typedef void (*radix_tree_update_node_t)(struct radix_tree_node *);
308void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, 227void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
309 void __rcu **slot, void *entry, 228 void __rcu **slot, void *entry);
310 radix_tree_update_node_t update_node);
311void radix_tree_iter_replace(struct radix_tree_root *, 229void radix_tree_iter_replace(struct radix_tree_root *,
312 const struct radix_tree_iter *, void __rcu **slot, void *entry); 230 const struct radix_tree_iter *, void __rcu **slot, void *entry);
313void radix_tree_replace_slot(struct radix_tree_root *, 231void radix_tree_replace_slot(struct radix_tree_root *,
314 void __rcu **slot, void *entry); 232 void __rcu **slot, void *entry);
315void __radix_tree_delete_node(struct radix_tree_root *,
316 struct radix_tree_node *,
317 radix_tree_update_node_t update_node);
318void radix_tree_iter_delete(struct radix_tree_root *, 233void radix_tree_iter_delete(struct radix_tree_root *,
319 struct radix_tree_iter *iter, void __rcu **slot); 234 struct radix_tree_iter *iter, void __rcu **slot);
320void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); 235void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
321void *radix_tree_delete(struct radix_tree_root *, unsigned long); 236void *radix_tree_delete(struct radix_tree_root *, unsigned long);
322void radix_tree_clear_tags(struct radix_tree_root *, struct radix_tree_node *,
323 void __rcu **slot);
324unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, 237unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
325 void **results, unsigned long first_index, 238 void **results, unsigned long first_index,
326 unsigned int max_items); 239 unsigned int max_items);
327unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
328 void __rcu ***results, unsigned long *indices,
329 unsigned long first_index, unsigned int max_items);
330int radix_tree_preload(gfp_t gfp_mask); 240int radix_tree_preload(gfp_t gfp_mask);
331int radix_tree_maybe_preload(gfp_t gfp_mask); 241int radix_tree_maybe_preload(gfp_t gfp_mask);
332int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
333void radix_tree_init(void); 242void radix_tree_init(void);
334void *radix_tree_tag_set(struct radix_tree_root *, 243void *radix_tree_tag_set(struct radix_tree_root *,
335 unsigned long index, unsigned int tag); 244 unsigned long index, unsigned int tag);
@@ -337,8 +246,6 @@ void *radix_tree_tag_clear(struct radix_tree_root *,
337 unsigned long index, unsigned int tag); 246 unsigned long index, unsigned int tag);
338int radix_tree_tag_get(const struct radix_tree_root *, 247int radix_tree_tag_get(const struct radix_tree_root *,
339 unsigned long index, unsigned int tag); 248 unsigned long index, unsigned int tag);
340void radix_tree_iter_tag_set(struct radix_tree_root *,
341 const struct radix_tree_iter *iter, unsigned int tag);
342void radix_tree_iter_tag_clear(struct radix_tree_root *, 249void radix_tree_iter_tag_clear(struct radix_tree_root *,
343 const struct radix_tree_iter *iter, unsigned int tag); 250 const struct radix_tree_iter *iter, unsigned int tag);
344unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *, 251unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
@@ -354,12 +261,6 @@ static inline void radix_tree_preload_end(void)
354 preempt_enable(); 261 preempt_enable();
355} 262}
356 263
357int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
358int radix_tree_split(struct radix_tree_root *, unsigned long index,
359 unsigned new_order);
360int radix_tree_join(struct radix_tree_root *, unsigned long index,
361 unsigned new_order, void *);
362
363void __rcu **idr_get_free(struct radix_tree_root *root, 264void __rcu **idr_get_free(struct radix_tree_root *root,
364 struct radix_tree_iter *iter, gfp_t gfp, 265 struct radix_tree_iter *iter, gfp_t gfp,
365 unsigned long max); 266 unsigned long max);
@@ -465,7 +366,7 @@ void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
465static inline unsigned long 366static inline unsigned long
466__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) 367__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
467{ 368{
468 return iter->index + (slots << iter_shift(iter)); 369 return iter->index + slots;
469} 370}
470 371
471/** 372/**
@@ -490,21 +391,9 @@ void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
490static __always_inline long 391static __always_inline long
491radix_tree_chunk_size(struct radix_tree_iter *iter) 392radix_tree_chunk_size(struct radix_tree_iter *iter)
492{ 393{
493 return (iter->next_index - iter->index) >> iter_shift(iter); 394 return iter->next_index - iter->index;
494} 395}
495 396
496#ifdef CONFIG_RADIX_TREE_MULTIORDER
497void __rcu **__radix_tree_next_slot(void __rcu **slot,
498 struct radix_tree_iter *iter, unsigned flags);
499#else
500/* Can't happen without sibling entries, but the compiler can't tell that */
501static inline void __rcu **__radix_tree_next_slot(void __rcu **slot,
502 struct radix_tree_iter *iter, unsigned flags)
503{
504 return slot;
505}
506#endif
507
508/** 397/**
509 * radix_tree_next_slot - find next slot in chunk 398 * radix_tree_next_slot - find next slot in chunk
510 * 399 *
@@ -563,8 +452,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
563 return NULL; 452 return NULL;
564 453
565 found: 454 found:
566 if (unlikely(radix_tree_is_internal_node(rcu_dereference_raw(*slot))))
567 return __radix_tree_next_slot(slot, iter, flags);
568 return slot; 455 return slot;
569} 456}
570 457
@@ -584,23 +471,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
584 slot = radix_tree_next_slot(slot, iter, 0)) 471 slot = radix_tree_next_slot(slot, iter, 0))
585 472
586/** 473/**
587 * radix_tree_for_each_contig - iterate over contiguous slots
588 *
589 * @slot: the void** variable for pointer to slot
590 * @root: the struct radix_tree_root pointer
591 * @iter: the struct radix_tree_iter pointer
592 * @start: iteration starting index
593 *
594 * @slot points to radix tree slot, @iter->index contains its index.
595 */
596#define radix_tree_for_each_contig(slot, root, iter, start) \
597 for (slot = radix_tree_iter_init(iter, start) ; \
598 slot || (slot = radix_tree_next_chunk(root, iter, \
599 RADIX_TREE_ITER_CONTIG)) ; \
600 slot = radix_tree_next_slot(slot, iter, \
601 RADIX_TREE_ITER_CONTIG))
602
603/**
604 * radix_tree_for_each_tagged - iterate over tagged slots 474 * radix_tree_for_each_tagged - iterate over tagged slots
605 * 475 *
606 * @slot: the void** variable for pointer to slot 476 * @slot: the void** variable for pointer to slot
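The remaining radix tree API is now a thin layer over the XArray: struct radix_tree_root is #defined to struct xarray and RADIX_TREE_INIT() expands to XARRAY_INIT(), so unconverted callers keep compiling against the new structure. A small sketch showing that both views name the same object (identifiers invented):

#include <linux/radix-tree.h>
#include <linux/xarray.h>

static RADIX_TREE(legacy_tree, GFP_KERNEL);     /* really a struct xarray */

static bool legacy_tree_is_empty(void)
{
        /* Either accessor works on the same underlying object. */
        return radix_tree_empty(&legacy_tree) && xa_empty(&legacy_tree);
}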
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38195f5c96b1..d8a07a4f171d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -300,17 +300,12 @@ void *workingset_eviction(struct address_space *mapping, struct page *page);
300void workingset_refault(struct page *page, void *shadow); 300void workingset_refault(struct page *page, void *shadow);
301void workingset_activation(struct page *page); 301void workingset_activation(struct page *page);
302 302
303/* Do not use directly, use workingset_lookup_update */ 303/* Only track the nodes of mappings with shadow entries */
304void workingset_update_node(struct radix_tree_node *node); 304void workingset_update_node(struct xa_node *node);
305 305#define mapping_set_update(xas, mapping) do { \
306/* Returns workingset_update_node() if the mapping has shadow entries. */ 306 if (!dax_mapping(mapping) && !shmem_mapping(mapping)) \
307#define workingset_lookup_update(mapping) \ 307 xas_set_update(xas, workingset_update_node); \
308({ \ 308} while (0)
309 radix_tree_update_node_t __helper = workingset_update_node; \
310 if (dax_mapping(mapping) || shmem_mapping(mapping)) \
311 __helper = NULL; \
312 __helper; \
313})
314 309
315/* linux/mm/page_alloc.c */ 310/* linux/mm/page_alloc.c */
316extern unsigned long totalram_pages; 311extern unsigned long totalram_pages;
@@ -409,7 +404,7 @@ extern void show_swap_cache_info(void);
409extern int add_to_swap(struct page *page); 404extern int add_to_swap(struct page *page);
410extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); 405extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
411extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); 406extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
412extern void __delete_from_swap_cache(struct page *); 407extern void __delete_from_swap_cache(struct page *, swp_entry_t entry);
413extern void delete_from_swap_cache(struct page *); 408extern void delete_from_swap_cache(struct page *);
414extern void free_page_and_swap_cache(struct page *); 409extern void free_page_and_swap_cache(struct page *);
415extern void free_pages_and_swap_cache(struct page **, int); 410extern void free_pages_and_swap_cache(struct page **, int);
@@ -563,7 +558,8 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
563 return -1; 558 return -1;
564} 559}
565 560
566static inline void __delete_from_swap_cache(struct page *page) 561static inline void __delete_from_swap_cache(struct page *page,
562 swp_entry_t entry)
567{ 563{
568} 564}
569 565
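mapping_set_update() now plugs workingset_update_node() into an xa_state up front instead of returning a callback to pass into every radix tree operation; page cache conversions call it once, right after declaring the XA_STATE. A sketch of the pattern for storing a shadow entry (the helper itself is hypothetical):

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/xarray.h>

static void store_shadow_entry(struct address_space *mapping, pgoff_t index,
                               void *shadow)
{
        XA_STATE(xas, &mapping->i_pages, index);

        mapping_set_update(&xas, mapping);      /* no-op for DAX and shmem */
        xas_lock_irq(&xas);
        xas_store(&xas, shadow);
        xas_unlock_irq(&xas);
}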
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 22af9d8a84ae..4d961668e5fc 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -18,9 +18,8 @@
18 * 18 *
19 * swp_entry_t's are *never* stored anywhere in their arch-dependent format. 19 * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
20 */ 20 */
21#define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \ 21#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
22 (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT)) 22#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
23#define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1)
24 23
25/* 24/*
26 * Store a type+offset into a swp_entry_t in an arch-independent format 25 * Store a type+offset into a swp_entry_t in an arch-independent format
@@ -29,8 +28,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
29{ 28{
30 swp_entry_t ret; 29 swp_entry_t ret;
31 30
32 ret.val = (type << SWP_TYPE_SHIFT(ret)) | 31 ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK);
33 (offset & SWP_OFFSET_MASK(ret));
34 return ret; 32 return ret;
35} 33}
36 34
@@ -40,7 +38,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
40 */ 38 */
41static inline unsigned swp_type(swp_entry_t entry) 39static inline unsigned swp_type(swp_entry_t entry)
42{ 40{
43 return (entry.val >> SWP_TYPE_SHIFT(entry)); 41 return (entry.val >> SWP_TYPE_SHIFT);
44} 42}
45 43
46/* 44/*
@@ -49,7 +47,7 @@ static inline unsigned swp_type(swp_entry_t entry)
49 */ 47 */
50static inline pgoff_t swp_offset(swp_entry_t entry) 48static inline pgoff_t swp_offset(swp_entry_t entry)
51{ 49{
52 return entry.val & SWP_OFFSET_MASK(entry); 50 return entry.val & SWP_OFFSET_MASK;
53} 51}
54 52
55#ifdef CONFIG_MMU 53#ifdef CONFIG_MMU
@@ -90,16 +88,13 @@ static inline swp_entry_t radix_to_swp_entry(void *arg)
90{ 88{
91 swp_entry_t entry; 89 swp_entry_t entry;
92 90
93 entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT; 91 entry.val = xa_to_value(arg);
94 return entry; 92 return entry;
95} 93}
96 94
97static inline void *swp_to_radix_entry(swp_entry_t entry) 95static inline void *swp_to_radix_entry(swp_entry_t entry)
98{ 96{
99 unsigned long value; 97 return xa_mk_value(entry.val);
100
101 value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;
102 return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
103} 98}
104 99
105#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) 100#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 2dfc8006fe64..d9514928ddac 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -4,10 +4,432 @@
4/* 4/*
5 * eXtensible Arrays 5 * eXtensible Arrays
6 * Copyright (c) 2017 Microsoft Corporation 6 * Copyright (c) 2017 Microsoft Corporation
7 * Author: Matthew Wilcox <mawilcox@microsoft.com> 7 * Author: Matthew Wilcox <willy@infradead.org>
8 *
9 * See Documentation/core-api/xarray.rst for how to use the XArray.
8 */ 10 */
9 11
12#include <linux/bug.h>
13#include <linux/compiler.h>
14#include <linux/gfp.h>
15#include <linux/kconfig.h>
16#include <linux/kernel.h>
17#include <linux/rcupdate.h>
10#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/types.h>
20
21/*
22 * The bottom two bits of the entry determine how the XArray interprets
23 * the contents:
24 *
25 * 00: Pointer entry
26 * 10: Internal entry
27 * x1: Value entry or tagged pointer
28 *
29 * Attempting to store internal entries in the XArray is a bug.
30 *
31 * Most internal entries are pointers to the next node in the tree.
32 * The following internal entries have a special meaning:
33 *
34 * 0-62: Sibling entries
35 * 256: Zero entry
36 * 257: Retry entry
37 *
38 * Errors are also represented as internal entries, but use the negative
39 * space (-4094 to -2). They're never stored in the slots array; only
40 * returned by the normal API.
41 */
42
43#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1)
44
45/**
46 * xa_mk_value() - Create an XArray entry from an integer.
47 * @v: Value to store in XArray.
48 *
49 * Context: Any context.
50 * Return: An entry suitable for storing in the XArray.
51 */
52static inline void *xa_mk_value(unsigned long v)
53{
54 WARN_ON((long)v < 0);
55 return (void *)((v << 1) | 1);
56}
57
58/**
59 * xa_to_value() - Get value stored in an XArray entry.
60 * @entry: XArray entry.
61 *
62 * Context: Any context.
63 * Return: The value stored in the XArray entry.
64 */
65static inline unsigned long xa_to_value(const void *entry)
66{
67 return (unsigned long)entry >> 1;
68}
69
70/**
71 * xa_is_value() - Determine if an entry is a value.
72 * @entry: XArray entry.
73 *
74 * Context: Any context.
75 * Return: True if the entry is a value, false if it is a pointer.
76 */
77static inline bool xa_is_value(const void *entry)
78{
79 return (unsigned long)entry & 1;
80}
81
82/**
83 * xa_tag_pointer() - Create an XArray entry for a tagged pointer.
84 * @p: Plain pointer.
85 * @tag: Tag value (0, 1 or 3).
86 *
87 * If the user of the XArray prefers, they can tag their pointers instead
88 * of storing value entries. Three tags are available (0, 1 and 3).
89 * These are distinct from the xa_mark_t as they are not replicated up
90 * through the array and cannot be searched for.
91 *
92 * Context: Any context.
93 * Return: An XArray entry.
94 */
95static inline void *xa_tag_pointer(void *p, unsigned long tag)
96{
97 return (void *)((unsigned long)p | tag);
98}
99
100/**
101 * xa_untag_pointer() - Turn an XArray entry into a plain pointer.
102 * @entry: XArray entry.
103 *
104 * If you have stored a tagged pointer in the XArray, call this function
105 * to get the untagged version of the pointer.
106 *
107 * Context: Any context.
108 * Return: A pointer.
109 */
110static inline void *xa_untag_pointer(void *entry)
111{
112 return (void *)((unsigned long)entry & ~3UL);
113}
114
115/**
116 * xa_pointer_tag() - Get the tag stored in an XArray entry.
117 * @entry: XArray entry.
118 *
119 * If you have stored a tagged pointer in the XArray, call this function
120 * to get the tag of that pointer.
121 *
122 * Context: Any context.
123 * Return: A tag.
124 */
125static inline unsigned int xa_pointer_tag(void *entry)
126{
127 return (unsigned long)entry & 3UL;
128}
129
130/*
131 * xa_mk_internal() - Create an internal entry.
132 * @v: Value to turn into an internal entry.
133 *
134 * Context: Any context.
135 * Return: An XArray internal entry corresponding to this value.
136 */
137static inline void *xa_mk_internal(unsigned long v)
138{
139 return (void *)((v << 2) | 2);
140}
141
142/*
143 * xa_to_internal() - Extract the value from an internal entry.
144 * @entry: XArray entry.
145 *
146 * Context: Any context.
147 * Return: The value which was stored in the internal entry.
148 */
149static inline unsigned long xa_to_internal(const void *entry)
150{
151 return (unsigned long)entry >> 2;
152}
153
154/*
155 * xa_is_internal() - Is the entry an internal entry?
156 * @entry: XArray entry.
157 *
158 * Context: Any context.
159 * Return: %true if the entry is an internal entry.
160 */
161static inline bool xa_is_internal(const void *entry)
162{
163 return ((unsigned long)entry & 3) == 2;
164}
165
166/**
167 * xa_is_err() - Report whether an XArray operation returned an error
168 * @entry: Result from calling an XArray function
169 *
170 * If an XArray operation cannot complete an operation, it will return
171 * a special value indicating an error. This function tells you
172 * whether an error occurred; xa_err() tells you which error occurred.
173 *
174 * Context: Any context.
175 * Return: %true if the entry indicates an error.
176 */
177static inline bool xa_is_err(const void *entry)
178{
179 return unlikely(xa_is_internal(entry));
180}
181
182/**
183 * xa_err() - Turn an XArray result into an errno.
184 * @entry: Result from calling an XArray function.
185 *
186 * If an XArray operation cannot complete an operation, it will return
187 * a special pointer value which encodes an errno. This function extracts
188 * the errno from the pointer value, or returns 0 if the pointer does not
189 * represent an errno.
190 *
191 * Context: Any context.
192 * Return: A negative errno or 0.
193 */
194static inline int xa_err(void *entry)
195{
196 /* xa_to_internal() would not do sign extension. */
197 if (xa_is_err(entry))
198 return (long)entry >> 2;
199 return 0;
200}
201
202typedef unsigned __bitwise xa_mark_t;
203#define XA_MARK_0 ((__force xa_mark_t)0U)
204#define XA_MARK_1 ((__force xa_mark_t)1U)
205#define XA_MARK_2 ((__force xa_mark_t)2U)
206#define XA_PRESENT ((__force xa_mark_t)8U)
207#define XA_MARK_MAX XA_MARK_2
208#define XA_FREE_MARK XA_MARK_0
209
210enum xa_lock_type {
211 XA_LOCK_IRQ = 1,
212 XA_LOCK_BH = 2,
213};
214
215/*
216 * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags,
217 * and we remain compatible with that.
218 */
219#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ)
220#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH)
221#define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U)
222#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
223 (__force unsigned)(mark)))
224
225#define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
226
227/**
228 * struct xarray - The anchor of the XArray.
229 * @xa_lock: Lock that protects the contents of the XArray.
230 *
231 * To use the xarray, define it statically or embed it in your data structure.
232 * It is a very small data structure, so it does not usually make sense to
233 * allocate it separately and keep a pointer to it in your data structure.
234 *
235 * You may use the xa_lock to protect your own data structures as well.
236 */
237/*
238 * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
239 * If the only non-NULL entry in the array is at index 0, @xa_head is that
240 * entry. If any other entry in the array is non-NULL, @xa_head points
241 * to an @xa_node.
242 */
243struct xarray {
244 spinlock_t xa_lock;
245/* private: The rest of the data structure is not to be used directly. */
246 gfp_t xa_flags;
247 void __rcu * xa_head;
248};
249
250#define XARRAY_INIT(name, flags) { \
251 .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \
252 .xa_flags = flags, \
253 .xa_head = NULL, \
254}
255
256/**
257 * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
258 * @name: A string that names your XArray.
259 * @flags: XA_FLAG values.
260 *
261 * This is intended for file scope definitions of XArrays. It declares
262 * and initialises an empty XArray with the chosen name and flags. It is
263 * equivalent to calling xa_init_flags() on the array, but it does the
264 * initialisation at compiletime instead of runtime.
265 */
266#define DEFINE_XARRAY_FLAGS(name, flags) \
267 struct xarray name = XARRAY_INIT(name, flags)
268
269/**
270 * DEFINE_XARRAY() - Define an XArray.
271 * @name: A string that names your XArray.
272 *
273 * This is intended for file scope definitions of XArrays. It declares
274 * and initialises an empty XArray with the chosen name. It is equivalent
275 * to calling xa_init() on the array, but it does the initialisation at
276 * compiletime instead of runtime.
277 */
278#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)
279
280/**
281 * DEFINE_XARRAY_ALLOC() - Define an XArray which can allocate IDs.
282 * @name: A string that names your XArray.
283 *
284 * This is intended for file scope definitions of allocating XArrays.
285 * See also DEFINE_XARRAY().
286 */
287#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)
288
289void xa_init_flags(struct xarray *, gfp_t flags);
290void *xa_load(struct xarray *, unsigned long index);
291void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
292void *xa_cmpxchg(struct xarray *, unsigned long index,
293 void *old, void *entry, gfp_t);
294int xa_reserve(struct xarray *, unsigned long index, gfp_t);
295void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
296 void *entry, gfp_t);
297bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
298void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
299void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
300void *xa_find(struct xarray *xa, unsigned long *index,
301 unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
302void *xa_find_after(struct xarray *xa, unsigned long *index,
303 unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
304unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
305 unsigned long max, unsigned int n, xa_mark_t);
306void xa_destroy(struct xarray *);
307
308/**
309 * xa_init() - Initialise an empty XArray.
310 * @xa: XArray.
311 *
312 * An empty XArray is full of NULL entries.
313 *
314 * Context: Any context.
315 */
316static inline void xa_init(struct xarray *xa)
317{
318 xa_init_flags(xa, 0);
319}
320
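A short sketch contrasting compile-time and run-time initialisation (the struct and function names are illustrative, not from this patch):

static DEFINE_XARRAY(global_objects);	/* file scope, initialised at compile time */

struct container {
	struct xarray children;		/* embedded, initialised at run time */
};

static void container_setup(struct container *c)
{
	xa_init(&c->children);		/* equivalent to DEFINE_XARRAY() */
}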
321/**
322 * xa_empty() - Determine if an array has any present entries.
323 * @xa: XArray.
324 *
325 * Context: Any context.
326 * Return: %true if the array contains only NULL pointers.
327 */
328static inline bool xa_empty(const struct xarray *xa)
329{
330 return xa->xa_head == NULL;
331}
332
333/**
334 * xa_marked() - Inquire whether any entry in this array has a mark set
335 * @xa: Array
336 * @mark: Mark value
337 *
338 * Context: Any context.
339 * Return: %true if any entry has this mark set.
340 */
341static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
342{
343 return xa->xa_flags & XA_FLAGS_MARK(mark);
344}
345
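A minimal sketch of the mark API, treating XA_MARK_0 as a hypothetical 'dirty' flag; marks are set per present entry, and xa_marked() asks whether any entry in the array carries the mark:

static void mark_dirty(struct xarray *xa, unsigned long index)
{
	xa_set_mark(xa, index, XA_MARK_0);	/* no effect if no entry is present */
}

static bool anything_dirty(struct xarray *xa)
{
	return xa_marked(xa, XA_MARK_0);
}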
346/**
347 * xa_erase() - Erase this entry from the XArray.
348 * @xa: XArray.
349 * @index: Index of entry.
350 *
351 * This function is the equivalent of calling xa_store() with %NULL as
352 * the third argument. The XArray does not need to allocate memory, so
353 * the user does not need to provide GFP flags.
354 *
355 * Context: Process context. Takes and releases the xa_lock.
356 * Return: The entry which used to be at this index.
357 */
358static inline void *xa_erase(struct xarray *xa, unsigned long index)
359{
360 return xa_store(xa, index, NULL, 0);
361}
362
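A sketch of the basic store/load/erase lifecycle with the normal API, assuming a hypothetical 4-byte-aligned struct widget pointer is being stored:

static DEFINE_XARRAY(widgets);			/* illustrative name */

static int widget_example(struct widget *w)
{
	void *old;

	old = xa_store(&widgets, 42, w, GFP_KERNEL);
	if (xa_is_err(old))
		return xa_err(old);

	WARN_ON(xa_load(&widgets, 42) != w);	/* xa_load() needs no caller locking */

	old = xa_erase(&widgets, 42);		/* same as storing NULL */
	return old == w ? 0 : -EINVAL;
}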
363/**
364 * xa_insert() - Store this entry in the XArray unless another entry is
365 * already present.
366 * @xa: XArray.
367 * @index: Index into array.
368 * @entry: New entry.
369 * @gfp: Memory allocation flags.
370 *
371 * If you would rather see the existing entry in the array, use xa_cmpxchg().
372 * This function is for users who don't care what the entry is, only that
373 * one is present.
374 *
375 * Context: Process context. Takes and releases the xa_lock.
376 * May sleep if the @gfp flags permit.
377 * Return: 0 if the store succeeded. -EEXIST if another entry was present.
378 * -ENOMEM if memory could not be allocated.
379 */
380static inline int xa_insert(struct xarray *xa, unsigned long index,
381 void *entry, gfp_t gfp)
382{
383 void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp);
384 if (!curr)
385 return 0;
386 if (xa_is_err(curr))
387 return xa_err(curr);
388 return -EEXIST;
389}
390
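A sketch of the insert-if-empty semantics; -EEXIST means another entry was already present (struct session and the function name are hypothetical):

static int register_session(struct xarray *sessions, unsigned long id,
			    struct session *s)
{
	int err = xa_insert(sessions, id, s, GFP_KERNEL);

	if (err == -EEXIST)
		pr_warn("session %lu already registered\n", id);
	return err;			/* 0, -EEXIST or -ENOMEM */
}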
391/**
392 * xa_release() - Release a reserved entry.
393 * @xa: XArray.
394 * @index: Index of entry.
395 *
396 * After calling xa_reserve(), you can call this function to release the
397 * reservation. If the entry at @index has been stored to, this function
398 * will do nothing.
399 */
400static inline void xa_release(struct xarray *xa, unsigned long index)
401{
402 xa_cmpxchg(xa, index, NULL, NULL, 0);
403}
404
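A sketch of the reserve/release pairing: xa_reserve() allocates any needed nodes up front so a later store at that index should not need to allocate, and xa_release() drops the reservation if nothing was stored (names are illustrative):

static int prepare_slot(struct xarray *xa, unsigned long index)
{
	return xa_reserve(xa, index, GFP_KERNEL);	/* may sleep; 0 or -ENOMEM */
}

static void commit_or_abort(struct xarray *xa, unsigned long index, void *entry)
{
	if (entry)
		xa_store(xa, index, entry, GFP_NOWAIT);	/* uses the reserved slot */
	else
		xa_release(xa, index);			/* give the reservation back */
}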
405/**
406 * xa_for_each() - Iterate over a portion of an XArray.
407 * @xa: XArray.
408 * @entry: Entry retrieved from array.
409 * @index: Index of @entry.
410 * @max: Maximum index to retrieve from array.
411 * @filter: Selection criterion.
412 *
413 * Initialise @index to the lowest index you want to retrieve from the
414 * array. During the iteration, @entry will have the value of the entry
415 * stored in @xa at @index. The iteration will skip all entries in the
416 * array which do not match @filter. You may modify @index during the
417 * iteration if you want to skip or reprocess indices. It is safe to modify
418 * the array during the iteration. At the end of the iteration, @entry will
419 * be set to NULL and @index will have a value less than or equal to max.
420 *
421 * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have
422 * to handle your own locking with xas_for_each(), and if you have to unlock
423 * after each iteration, it will also end up being O(n.log(n)). xa_for_each()
424 * will spin if it hits a retry entry; if you intend to see retry entries,
425 * you should use the xas_for_each() iterator instead. The xas_for_each()
426 * iterator will expand into more inline code than xa_for_each().
427 *
428 * Context: Any context. Takes and releases the RCU lock.
429 */
430#define xa_for_each(xa, entry, index, max, filter) \
431 for (entry = xa_find(xa, &index, max, filter); entry; \
432 entry = xa_find_after(xa, &index, max, filter))
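A sketch of the iteration pattern with this version of the macro: the caller supplies and initialises the index variable, and XA_PRESENT filters for any non-NULL entry (struct widget is hypothetical):

static void show_all(struct xarray *xa)
{
	struct widget *w;
	unsigned long index = 0;		/* start of the range to iterate */

	xa_for_each(xa, w, index, ULONG_MAX, XA_PRESENT)
		pr_info("index %lu -> %p\n", index, w);
}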
11 433
12#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) 434#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
13#define xa_lock(xa) spin_lock(&(xa)->xa_lock) 435#define xa_lock(xa) spin_lock(&(xa)->xa_lock)
@@ -21,4 +443,873 @@
21#define xa_unlock_irqrestore(xa, flags) \ 443#define xa_unlock_irqrestore(xa, flags) \
22 spin_unlock_irqrestore(&(xa)->xa_lock, flags) 444 spin_unlock_irqrestore(&(xa)->xa_lock, flags)
23 445
446/*
447 * Versions of the normal API which require the caller to hold the
448 * xa_lock. If the GFP flags allow it, they will drop the lock to
449 * allocate memory, then reacquire it afterwards. These functions
450 * may also re-enable interrupts if the XArray flags indicate the
451 * locking should be interrupt safe.
452 */
453void *__xa_erase(struct xarray *, unsigned long index);
454void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
455void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
456 void *entry, gfp_t);
457int __xa_alloc(struct xarray *, u32 *id, u32 max, void *entry, gfp_t);
458void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
459void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
460
461/**
462 * __xa_insert() - Store this entry in the XArray unless another entry is
463 * already present.
464 * @xa: XArray.
465 * @index: Index into array.
466 * @entry: New entry.
467 * @gfp: Memory allocation flags.
468 *
469 * If you would rather see the existing entry in the array, use __xa_cmpxchg().
470 * This function is for users who don't care what the entry is, only that
471 * one is present.
472 *
473 * Context: Any context. Expects xa_lock to be held on entry. May
474 * release and reacquire xa_lock if the @gfp flags permit.
475 * Return: 0 if the store succeeded. -EEXIST if another entry was present.
476 * -ENOMEM if memory could not be allocated.
477 */
478static inline int __xa_insert(struct xarray *xa, unsigned long index,
479 void *entry, gfp_t gfp)
480{
481 void *curr = __xa_cmpxchg(xa, index, NULL, entry, gfp);
482 if (!curr)
483 return 0;
484 if (xa_is_err(curr))
485 return xa_err(curr);
486 return -EEXIST;
487}
488
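A sketch of why the __xa_* variants exist: taking the xa_lock once around two operations keeps them ordered against other writers, although, as the comment above notes, the store may still drop and retake the lock to allocate memory (names are illustrative):

static int move_entry(struct xarray *xa, unsigned long from, unsigned long to)
{
	void *entry;
	int err = 0;

	xa_lock(xa);
	entry = __xa_erase(xa, from);
	if (entry)
		err = xa_err(__xa_store(xa, to, entry, GFP_KERNEL));
	xa_unlock(xa);

	return err;
}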
489/**
490 * xa_erase_bh() - Erase this entry from the XArray.
491 * @xa: XArray.
492 * @index: Index of entry.
493 *
494 * This function is the equivalent of calling xa_store() with %NULL as
495 * the third argument. The XArray does not need to allocate memory, so
496 * the user does not need to provide GFP flags.
497 *
498 * Context: Process context. Takes and releases the xa_lock while
499 * disabling softirqs.
500 * Return: The entry which used to be at this index.
501 */
502static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
503{
504 void *entry;
505
506 xa_lock_bh(xa);
507 entry = __xa_erase(xa, index);
508 xa_unlock_bh(xa);
509
510 return entry;
511}
512
513/**
514 * xa_erase_irq() - Erase this entry from the XArray.
515 * @xa: XArray.
516 * @index: Index of entry.
517 *
518 * This function is the equivalent of calling xa_store() with %NULL as
519 * the third argument. The XArray does not need to allocate memory, so
520 * the user does not need to provide GFP flags.
521 *
522 * Context: Process context. Takes and releases the xa_lock while
523 * disabling interrupts.
524 * Return: The entry which used to be at this index.
525 */
526static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
527{
528 void *entry;
529
530 xa_lock_irq(xa);
531 entry = __xa_erase(xa, index);
532 xa_unlock_irq(xa);
533
534 return entry;
535}
536
537/**
538 * xa_alloc() - Find somewhere to store this entry in the XArray.
539 * @xa: XArray.
540 * @id: Pointer to ID.
541 * @max: Maximum ID to allocate (inclusive).
542 * @entry: New entry.
543 * @gfp: Memory allocation flags.
544 *
545 * Allocates an unused ID in the range specified by @id and @max.
546 * Updates the @id pointer with the index, then stores the entry at that
547 * index. A concurrent lookup will not see an uninitialised @id.
548 *
549 * Context: Process context. Takes and releases the xa_lock. May sleep if
550 * the @gfp flags permit.
551 * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
552 * there is no more space in the XArray.
553 */
554static inline int xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry,
555 gfp_t gfp)
556{
557 int err;
558
559 xa_lock(xa);
560 err = __xa_alloc(xa, id, max, entry, gfp);
561 xa_unlock(xa);
562
563 return err;
564}
565
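A sketch of ID allocation against an array defined with DEFINE_XARRAY_ALLOC(); on entry *id is the lowest acceptable ID, and on success it holds the allocated one (struct client and its id field are hypothetical):

static DEFINE_XARRAY_ALLOC(clients);

static int new_client(struct client *c)
{
	u32 id = 1;			/* keep ID 0 out of use in this sketch */
	int err = xa_alloc(&clients, &id, UINT_MAX, c, GFP_KERNEL);

	if (err < 0)
		return err;		/* -ENOMEM or -ENOSPC */
	c->id = id;
	return 0;
}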
566/**
567 * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
568 * @xa: XArray.
569 * @id: Pointer to ID.
570 * @max: Maximum ID to allocate (inclusive).
571 * @entry: New entry.
572 * @gfp: Memory allocation flags.
573 *
574 * Allocates an unused ID in the range specified by @id and @max.
575 * Updates the @id pointer with the index, then stores the entry at that
576 * index. A concurrent lookup will not see an uninitialised @id.
577 *
578 * Context: Process context. Takes and releases the xa_lock while
579 * disabling softirqs. May sleep if the @gfp flags permit.
580 * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
581 * there is no more space in the XArray.
582 */
583static inline int xa_alloc_bh(struct xarray *xa, u32 *id, u32 max, void *entry,
584 gfp_t gfp)
585{
586 int err;
587
588 xa_lock_bh(xa);
589 err = __xa_alloc(xa, id, max, entry, gfp);
590 xa_unlock_bh(xa);
591
592 return err;
593}
594
595/**
596 * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
597 * @xa: XArray.
598 * @id: Pointer to ID.
599 * @max: Maximum ID to allocate (inclusive).
600 * @entry: New entry.
601 * @gfp: Memory allocation flags.
602 *
603 * Allocates an unused ID in the range specified by @id and @max.
604 * Updates the @id pointer with the index, then stores the entry at that
605 * index. A concurrent lookup will not see an uninitialised @id.
606 *
607 * Context: Process context. Takes and releases the xa_lock while
608 * disabling interrupts. May sleep if the @gfp flags permit.
609 * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
610 * there is no more space in the XArray.
611 */
612static inline int xa_alloc_irq(struct xarray *xa, u32 *id, u32 max, void *entry,
613 gfp_t gfp)
614{
615 int err;
616
617 xa_lock_irq(xa);
618 err = __xa_alloc(xa, id, max, entry, gfp);
619 xa_unlock_irq(xa);
620
621 return err;
622}
623
624/* Everything below here is the Advanced API. Proceed with caution. */
625
626/*
627 * The xarray is constructed out of a set of 'chunks' of pointers. Choosing
628 * the best chunk size requires some tradeoffs. A power of two recommends
629 * itself so that we can walk the tree based purely on shifts and masks.
630 * Generally, the larger the better; as the number of slots per level of the
631 * tree increases, the less tall the tree needs to be. But that needs to be
632 * balanced against the memory consumption of each node. On a 64-bit system,
633 * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we
634 * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
635 */
636#ifndef XA_CHUNK_SHIFT
637#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
638#endif
639#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)
640#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1)
641#define XA_MAX_MARKS 3
642#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
643
644/*
645 * @count is the count of every non-NULL element in the ->slots array
646 * whether that is a value entry, a retry entry, a user pointer,
647 * a sibling entry or a pointer to the next level of the tree.
648 * @nr_values is the count of every element in ->slots which is
649 * either a value entry or a sibling of a value entry.
650 */
651struct xa_node {
652 unsigned char shift; /* Bits remaining in each slot */
653 unsigned char offset; /* Slot offset in parent */
654 unsigned char count; /* Total entry count */
655 unsigned char nr_values; /* Value entry count */
656 struct xa_node __rcu *parent; /* NULL at top of tree */
657 struct xarray *array; /* The array we belong to */
658 union {
659 struct list_head private_list; /* For tree user */
660 struct rcu_head rcu_head; /* Used when freeing node */
661 };
662 void __rcu *slots[XA_CHUNK_SIZE];
663 union {
664 unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS];
665 unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS];
666 };
667};
668
669void xa_dump(const struct xarray *);
670void xa_dump_node(const struct xa_node *);
671
672#ifdef XA_DEBUG
673#define XA_BUG_ON(xa, x) do { \
674 if (x) { \
675 xa_dump(xa); \
676 BUG(); \
677 } \
678 } while (0)
679#define XA_NODE_BUG_ON(node, x) do { \
680 if (x) { \
681 if (node) xa_dump_node(node); \
682 BUG(); \
683 } \
684 } while (0)
685#else
686#define XA_BUG_ON(xa, x) do { } while (0)
687#define XA_NODE_BUG_ON(node, x) do { } while (0)
688#endif
689
690/* Private */
691static inline void *xa_head(const struct xarray *xa)
692{
693 return rcu_dereference_check(xa->xa_head,
694 lockdep_is_held(&xa->xa_lock));
695}
696
697/* Private */
698static inline void *xa_head_locked(const struct xarray *xa)
699{
700 return rcu_dereference_protected(xa->xa_head,
701 lockdep_is_held(&xa->xa_lock));
702}
703
704/* Private */
705static inline void *xa_entry(const struct xarray *xa,
706 const struct xa_node *node, unsigned int offset)
707{
708 XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
709 return rcu_dereference_check(node->slots[offset],
710 lockdep_is_held(&xa->xa_lock));
711}
712
713/* Private */
714static inline void *xa_entry_locked(const struct xarray *xa,
715 const struct xa_node *node, unsigned int offset)
716{
717 XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
718 return rcu_dereference_protected(node->slots[offset],
719 lockdep_is_held(&xa->xa_lock));
720}
721
722/* Private */
723static inline struct xa_node *xa_parent(const struct xarray *xa,
724 const struct xa_node *node)
725{
726 return rcu_dereference_check(node->parent,
727 lockdep_is_held(&xa->xa_lock));
728}
729
730/* Private */
731static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
732 const struct xa_node *node)
733{
734 return rcu_dereference_protected(node->parent,
735 lockdep_is_held(&xa->xa_lock));
736}
737
738/* Private */
739static inline void *xa_mk_node(const struct xa_node *node)
740{
741 return (void *)((unsigned long)node | 2);
742}
743
744/* Private */
745static inline struct xa_node *xa_to_node(const void *entry)
746{
747 return (struct xa_node *)((unsigned long)entry - 2);
748}
749
750/* Private */
751static inline bool xa_is_node(const void *entry)
752{
753 return xa_is_internal(entry) && (unsigned long)entry > 4096;
754}
755
756/* Private */
757static inline void *xa_mk_sibling(unsigned int offset)
758{
759 return xa_mk_internal(offset);
760}
761
762/* Private */
763static inline unsigned long xa_to_sibling(const void *entry)
764{
765 return xa_to_internal(entry);
766}
767
768/**
769 * xa_is_sibling() - Is the entry a sibling entry?
770 * @entry: Entry retrieved from the XArray
771 *
772 * Return: %true if the entry is a sibling entry.
773 */
774static inline bool xa_is_sibling(const void *entry)
775{
776 return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) &&
777 (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
778}
779
780#define XA_ZERO_ENTRY xa_mk_internal(256)
781#define XA_RETRY_ENTRY xa_mk_internal(257)
782
783/**
784 * xa_is_zero() - Is the entry a zero entry?
785 * @entry: Entry retrieved from the XArray
786 *
787 * Return: %true if the entry is a zero entry.
788 */
789static inline bool xa_is_zero(const void *entry)
790{
791 return unlikely(entry == XA_ZERO_ENTRY);
792}
793
794/**
795 * xa_is_retry() - Is the entry a retry entry?
796 * @entry: Entry retrieved from the XArray
797 *
798 * Return: %true if the entry is a retry entry.
799 */
800static inline bool xa_is_retry(const void *entry)
801{
802 return unlikely(entry == XA_RETRY_ENTRY);
803}
804
805/**
806 * typedef xa_update_node_t - A callback function from the XArray.
807 * @node: The node which is being processed
808 *
809 * This function is called every time the XArray updates the count of
810 * present and value entries in a node. It allows advanced users to
811 * maintain the private_list in the node.
812 *
813 * Context: The xa_lock is held and interrupts may be disabled.
814 * Implementations should not drop the xa_lock, nor re-enable
815 * interrupts.
816 */
817typedef void (*xa_update_node_t)(struct xa_node *node);
818
819/*
820 * The xa_state is opaque to its users. It contains various different pieces
821 * of state involved in the current operation on the XArray. It should be
822 * declared on the stack and passed between the various internal routines.
823 * The various elements in it should not be accessed directly, but only
824 * through the provided accessor functions. The below documentation is for
825 * the benefit of those working on the code, not for users of the XArray.
826 *
827 * @xa_node usually points to the xa_node containing the slot we're operating
828 * on (and @xa_offset is the offset in the slots array). If there is a
829 * single entry in the array at index 0, there are no allocated xa_nodes to
830 * point to, and so we store %NULL in @xa_node. @xa_node is set to
831 * the value %XAS_RESTART if the xa_state is not walked to the correct
832 * position in the tree of nodes for this operation. If an error occurs
833 * during an operation, it is set to an %XAS_ERROR value. If we run off the
834 * end of the allocated nodes, it is set to %XAS_BOUNDS.
835 */
836struct xa_state {
837 struct xarray *xa;
838 unsigned long xa_index;
839 unsigned char xa_shift;
840 unsigned char xa_sibs;
841 unsigned char xa_offset;
842 unsigned char xa_pad; /* Helps gcc generate better code */
843 struct xa_node *xa_node;
844 struct xa_node *xa_alloc;
845 xa_update_node_t xa_update;
846};
847
848/*
849 * We encode errnos in the xas->xa_node. If an error has happened, we need to
850 * drop the lock to fix it, and once we've done so the xa_state is invalid.
851 */
852#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL))
853#define XAS_BOUNDS ((struct xa_node *)1UL)
854#define XAS_RESTART ((struct xa_node *)3UL)
855
856#define __XA_STATE(array, index, shift, sibs) { \
857 .xa = array, \
858 .xa_index = index, \
859 .xa_shift = shift, \
860 .xa_sibs = sibs, \
861 .xa_offset = 0, \
862 .xa_pad = 0, \
863 .xa_node = XAS_RESTART, \
864 .xa_alloc = NULL, \
865 .xa_update = NULL \
866}
867
868/**
869 * XA_STATE() - Declare an XArray operation state.
870 * @name: Name of this operation state (usually xas).
871 * @array: Array to operate on.
872 * @index: Initial index of interest.
873 *
874 * Declare and initialise an xa_state on the stack.
875 */
876#define XA_STATE(name, array, index) \
877 struct xa_state name = __XA_STATE(array, index, 0, 0)
878
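A minimal sketch of the advanced API: an xa_state declared on the stack and used for a lookup under the RCU read lock. This is roughly what xa_load() does internally, but the state can be reused for further work at the same index:

static void *peek(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	void *entry;

	rcu_read_lock();
	entry = xas_load(&xas);	/* may be an internal entry; see xas_retry() below */
	rcu_read_unlock();

	return entry;
}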
879/**
880 * XA_STATE_ORDER() - Declare an XArray operation state.
881 * @name: Name of this operation state (usually xas).
882 * @array: Array to operate on.
883 * @index: Initial index of interest.
884 * @order: Order of entry.
885 *
886 * Declare and initialise an xa_state on the stack. This variant of
887 * XA_STATE() allows you to specify the 'order' of the element you
888 * want to operate on.
889 */
890#define XA_STATE_ORDER(name, array, index, order) \
891 struct xa_state name = __XA_STATE(array, \
892 (index >> order) << order, \
893 order - (order % XA_CHUNK_SHIFT), \
894 (1U << (order % XA_CHUNK_SHIFT)) - 1)
895
896#define xas_marked(xas, mark) xa_marked((xas)->xa, (mark))
897#define xas_trylock(xas) xa_trylock((xas)->xa)
898#define xas_lock(xas) xa_lock((xas)->xa)
899#define xas_unlock(xas) xa_unlock((xas)->xa)
900#define xas_lock_bh(xas) xa_lock_bh((xas)->xa)
901#define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa)
902#define xas_lock_irq(xas) xa_lock_irq((xas)->xa)
903#define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa)
904#define xas_lock_irqsave(xas, flags) \
905 xa_lock_irqsave((xas)->xa, flags)
906#define xas_unlock_irqrestore(xas, flags) \
907 xa_unlock_irqrestore((xas)->xa, flags)
908
909/**
910 * xas_error() - Return an errno stored in the xa_state.
911 * @xas: XArray operation state.
912 *
913 * Return: 0 if no error has been noted. A negative errno if one has.
914 */
915static inline int xas_error(const struct xa_state *xas)
916{
917 return xa_err(xas->xa_node);
918}
919
920/**
921 * xas_set_err() - Note an error in the xa_state.
922 * @xas: XArray operation state.
923 * @err: Negative error number.
924 *
925 * Only call this function with a negative @err; zero or positive errors
926 * will probably not behave the way you think they should. If you want
927 * to clear the error from an xa_state, use xas_reset().
928 */
929static inline void xas_set_err(struct xa_state *xas, long err)
930{
931 xas->xa_node = XA_ERROR(err);
932}
933
934/**
935 * xas_invalid() - Is the xas in a retry or error state?
936 * @xas: XArray operation state.
937 *
938 * Return: %true if the xas cannot be used for operations.
939 */
940static inline bool xas_invalid(const struct xa_state *xas)
941{
942 return (unsigned long)xas->xa_node & 3;
943}
944
945/**
946 * xas_valid() - Is the xas a valid cursor into the array?
947 * @xas: XArray operation state.
948 *
949 * Return: %true if the xas can be used for operations.
950 */
951static inline bool xas_valid(const struct xa_state *xas)
952{
953 return !xas_invalid(xas);
954}
955
956/**
957 * xas_is_node() - Does the xas point to a node?
958 * @xas: XArray operation state.
959 *
960 * Return: %true if the xas currently references a node.
961 */
962static inline bool xas_is_node(const struct xa_state *xas)
963{
964 return xas_valid(xas) && xas->xa_node;
965}
966
967/* True if the pointer is something other than a node */
968static inline bool xas_not_node(struct xa_node *node)
969{
970 return ((unsigned long)node & 3) || !node;
971}
972
973/* True if the node represents RESTART or an error */
974static inline bool xas_frozen(struct xa_node *node)
975{
976 return (unsigned long)node & 2;
977}
978
979/* True if the node represents head-of-tree, RESTART or BOUNDS */
980static inline bool xas_top(struct xa_node *node)
981{
982 return node <= XAS_RESTART;
983}
984
985/**
986 * xas_reset() - Reset an XArray operation state.
987 * @xas: XArray operation state.
988 *
989 * Resets the error or walk state of the @xas so future walks of the
990 * array will start from the root. Use this if you have dropped the
991 * xarray lock and want to reuse the xa_state.
992 *
993 * Context: Any context.
994 */
995static inline void xas_reset(struct xa_state *xas)
996{
997 xas->xa_node = XAS_RESTART;
998}
999
1000/**
1001 * xas_retry() - Retry the operation if appropriate.
1002 * @xas: XArray operation state.
1003 * @entry: Entry from xarray.
1004 *
1005 * The advanced functions may sometimes return an internal entry, such as
1006 * a retry entry or a zero entry. This function sets up the @xas to restart
1007 * the walk from the head of the array if needed.
1008 *
1009 * Context: Any context.
1010 * Return: true if the operation needs to be retried.
1011 */
1012static inline bool xas_retry(struct xa_state *xas, const void *entry)
1013{
1014 if (xa_is_zero(entry))
1015 return true;
1016 if (!xa_is_retry(entry))
1017 return false;
1018 xas_reset(xas);
1019 return true;
1020}
1021
1022void *xas_load(struct xa_state *);
1023void *xas_store(struct xa_state *, void *entry);
1024void *xas_find(struct xa_state *, unsigned long max);
1025void *xas_find_conflict(struct xa_state *);
1026
1027bool xas_get_mark(const struct xa_state *, xa_mark_t);
1028void xas_set_mark(const struct xa_state *, xa_mark_t);
1029void xas_clear_mark(const struct xa_state *, xa_mark_t);
1030void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
1031void xas_init_marks(const struct xa_state *);
1032
1033bool xas_nomem(struct xa_state *, gfp_t);
1034void xas_pause(struct xa_state *);
1035
1036void xas_create_range(struct xa_state *);
1037
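A sketch of the standard store loop with the advanced API: attempt the store under the lock and, if it failed for lack of memory, let xas_nomem() allocate outside the lock and retry (this mirrors the pattern the in-kernel users follow, but is only a sketch):

static int advanced_store(struct xarray *xa, unsigned long index, void *entry)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock(&xas);
		xas_store(&xas, entry);
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));	/* allocates with the lock dropped */

	return xas_error(&xas);			/* 0 or a negative errno */
}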
1038/**
1039 * xas_reload() - Refetch an entry from the xarray.
1040 * @xas: XArray operation state.
1041 *
1042 * Use this function to check that a previously loaded entry still has
1043 * the same value. This is useful for the lockless pagecache lookup where
1044 * we walk the array with only the RCU lock to protect us, lock the page,
1045 * then check that the page hasn't moved since we looked it up.
1046 *
1047 * The caller guarantees that @xas is still valid. If it may be in an
1048 * error or restart state, call xas_load() instead.
1049 *
1050 * Return: The entry at this location in the xarray.
1051 */
1052static inline void *xas_reload(struct xa_state *xas)
1053{
1054 struct xa_node *node = xas->xa_node;
1055
1056 if (node)
1057 return xa_entry(xas->xa, node, xas->xa_offset);
1058 return xa_head(xas->xa);
1059}
1060
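A sketch of the lookup-then-verify pattern described above, with hypothetical thing_tryget()/thing_put() refcount helpers standing in for page locking:

static struct thing *lookup_and_pin(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	struct thing *t;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	t = xas_load(&xas);
	if (xas_retry(&xas, t))
		goto repeat;
	if (t) {
		if (!thing_tryget(t))		/* hypothetical speculative get */
			goto repeat;
		if (xas_reload(&xas) != t) {	/* entry moved underneath us */
			thing_put(t);		/* hypothetical */
			goto repeat;
		}
	}
	rcu_read_unlock();
	return t;
}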
1061/**
1062 * xas_set() - Set up XArray operation state for a different index.
1063 * @xas: XArray operation state.
1064 * @index: New index into the XArray.
1065 *
1066 * Move the operation state to refer to a different index. This will
1067 * have the effect of starting a walk from the top; see xas_next()
1068 * to move to an adjacent index.
1069 */
1070static inline void xas_set(struct xa_state *xas, unsigned long index)
1071{
1072 xas->xa_index = index;
1073 xas->xa_node = XAS_RESTART;
1074}
1075
1076/**
1077 * xas_set_order() - Set up XArray operation state for a multislot entry.
1078 * @xas: XArray operation state.
1079 * @index: Target of the operation.
1080 * @order: Entry occupies 2^@order indices.
1081 */
1082static inline void xas_set_order(struct xa_state *xas, unsigned long index,
1083 unsigned int order)
1084{
1085#ifdef CONFIG_XARRAY_MULTI
1086 xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0;
1087 xas->xa_shift = order - (order % XA_CHUNK_SHIFT);
1088 xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
1089 xas->xa_node = XAS_RESTART;
1090#else
1091 BUG_ON(order > 0);
1092 xas_set(xas, index);
1093#endif
1094}
1095
1096/**
1097 * xas_set_update() - Set up XArray operation state for a callback.
1098 * @xas: XArray operation state.
1099 * @update: Function to call when updating a node.
1100 *
1101 * The XArray can notify a caller after it has updated an xa_node.
1102 * This is advanced functionality and is only needed by the page cache.
1103 */
1104static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
1105{
1106 xas->xa_update = update;
1107}
1108
1109/**
1110 * xas_next_entry() - Advance iterator to next present entry.
1111 * @xas: XArray operation state.
1112 * @max: Highest index to return.
1113 *
1114 * xas_next_entry() is an inline function to optimise xarray traversal for
1115 * speed. It is equivalent to calling xas_find(), and will call xas_find()
1116 * for all the hard cases.
1117 *
1118 * Return: The next present entry after the one currently referred to by @xas.
1119 */
1120static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
1121{
1122 struct xa_node *node = xas->xa_node;
1123 void *entry;
1124
1125 if (unlikely(xas_not_node(node) || node->shift ||
1126 xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
1127 return xas_find(xas, max);
1128
1129 do {
1130 if (unlikely(xas->xa_index >= max))
1131 return xas_find(xas, max);
1132 if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
1133 return xas_find(xas, max);
1134 entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
1135 if (unlikely(xa_is_internal(entry)))
1136 return xas_find(xas, max);
1137 xas->xa_offset++;
1138 xas->xa_index++;
1139 } while (!entry);
1140
1141 return entry;
1142}
1143
1144/* Private */
1145static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
1146 xa_mark_t mark)
1147{
1148 unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark];
1149 unsigned int offset = xas->xa_offset;
1150
1151 if (advance)
1152 offset++;
1153 if (XA_CHUNK_SIZE == BITS_PER_LONG) {
1154 if (offset < XA_CHUNK_SIZE) {
1155 unsigned long data = *addr & (~0UL << offset);
1156 if (data)
1157 return __ffs(data);
1158 }
1159 return XA_CHUNK_SIZE;
1160 }
1161
1162 return find_next_bit(addr, XA_CHUNK_SIZE, offset);
1163}
1164
1165/**
1166 * xas_next_marked() - Advance iterator to next marked entry.
1167 * @xas: XArray operation state.
1168 * @max: Highest index to return.
1169 * @mark: Mark to search for.
1170 *
1171 * xas_next_marked() is an inline function to optimise xarray traversal for
1172 * speed. It is equivalent to calling xas_find_marked(), and will call
1173 * xas_find_marked() for all the hard cases.
1174 *
1175 * Return: The next marked entry after the one currently referred to by @xas.
1176 */
1177static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
1178 xa_mark_t mark)
1179{
1180 struct xa_node *node = xas->xa_node;
1181 unsigned int offset;
1182
1183 if (unlikely(xas_not_node(node) || node->shift))
1184 return xas_find_marked(xas, max, mark);
1185 offset = xas_find_chunk(xas, true, mark);
1186 xas->xa_offset = offset;
1187 xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset;
1188 if (xas->xa_index > max)
1189 return NULL;
1190 if (offset == XA_CHUNK_SIZE)
1191 return xas_find_marked(xas, max, mark);
1192 return xa_entry(xas->xa, node, offset);
1193}
1194
1195/*
1196 * If iterating while holding a lock, drop the lock and reschedule
1197 * every %XA_CHECK_SCHED loops.
1198 */
1199enum {
1200 XA_CHECK_SCHED = 4096,
1201};
1202
1203/**
1204 * xas_for_each() - Iterate over a range of an XArray.
1205 * @xas: XArray operation state.
1206 * @entry: Entry retrieved from the array.
1207 * @max: Maximum index to retrieve from array.
1208 *
1209 * The loop body will be executed for each entry present in the xarray
1210 * between the current xas position and @max. @entry will be set to
1211 * the entry retrieved from the xarray. It is safe to delete entries
1212 * from the array in the loop body. You should hold either the RCU lock
1213 * or the xa_lock while iterating. If you need to drop the lock, call
1214 * xas_pause() first.
1215 */
1216#define xas_for_each(xas, entry, max) \
1217 for (entry = xas_find(xas, max); entry; \
1218 entry = xas_next_entry(xas, max))
1219
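A sketch of iterating with the advanced API under the RCU lock, skipping internal retry entries and using xas_pause() together with the XA_CHECK_SCHED convention above (the processing step is elided):

static void walk_all(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);
	void *entry;
	unsigned int i = 0;

	rcu_read_lock();
	xas_for_each(&xas, entry, ULONG_MAX) {
		if (xas_retry(&xas, entry))
			continue;
		/* ... process entry ... */
		if (++i % XA_CHECK_SCHED)
			continue;
		xas_pause(&xas);		/* make dropping the lock safe */
		rcu_read_unlock();
		cond_resched();
		rcu_read_lock();
	}
	rcu_read_unlock();
}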
1220/**
1221 * xas_for_each_marked() - Iterate over a range of an XArray.
1222 * @xas: XArray operation state.
1223 * @entry: Entry retrieved from the array.
1224 * @max: Maximum index to retrieve from array.
1225 * @mark: Mark to search for.
1226 *
1227 * The loop body will be executed for each marked entry in the xarray
1228 * between the current xas position and @max. @entry will be set to
1229 * the entry retrieved from the xarray. It is safe to delete entries
1230 * from the array in the loop body. You should hold either the RCU lock
1231 * or the xa_lock while iterating. If you need to drop the lock, call
1232 * xas_pause() first.
1233 */
1234#define xas_for_each_marked(xas, entry, max, mark) \
1235 for (entry = xas_find_marked(xas, max, mark); entry; \
1236 entry = xas_next_marked(xas, max, mark))
1237
1238/**
1239 * xas_for_each_conflict() - Iterate over a range of an XArray.
1240 * @xas: XArray operation state.
1241 * @entry: Entry retrieved from the array.
1242 *
1243 * The loop body will be executed for each entry in the XArray that lies
1244 * within the range specified by @xas. If the loop completes successfully,
1245 * any entries that lie in this range will be replaced by @entry. The caller
1246 * may break out of the loop; if they do so, the contents of the XArray will
1247 * be unchanged. The operation may fail due to an out of memory condition.
1248 * The caller may also call xa_set_err() to exit the loop while setting an
1249 * error to record the reason.
1250 */
1251#define xas_for_each_conflict(xas, entry) \
1252 while ((entry = xas_find_conflict(xas)))
1253
1254void *__xas_next(struct xa_state *);
1255void *__xas_prev(struct xa_state *);
1256
1257/**
1258 * xas_prev() - Move iterator to previous index.
1259 * @xas: XArray operation state.
1260 *
1261 * If the @xas was in an error state, it will remain in an error state
1262 * and this function will return %NULL. If the @xas has never been walked,
1263 * it will have the effect of calling xas_load(). Otherwise one will be
1264 * subtracted from the index and the state will be walked to the correct
1265 * location in the array for the next operation.
1266 *
1267 * If the iterator was referencing index 0, this function wraps
1268 * around to %ULONG_MAX.
1269 *
1270 * Return: The entry at the new index. This may be %NULL or an internal
1271 * entry.
1272 */
1273static inline void *xas_prev(struct xa_state *xas)
1274{
1275 struct xa_node *node = xas->xa_node;
1276
1277 if (unlikely(xas_not_node(node) || node->shift ||
1278 xas->xa_offset == 0))
1279 return __xas_prev(xas);
1280
1281 xas->xa_index--;
1282 xas->xa_offset--;
1283 return xa_entry(xas->xa, node, xas->xa_offset);
1284}
1285
1286/**
1287 * xas_next() - Move state to next index.
1288 * @xas: XArray operation state.
1289 *
1290 * If the @xas was in an error state, it will remain in an error state
1291 * and this function will return %NULL. If the @xas has never been walked,
1292 * it will have the effect of calling xas_load(). Otherwise one will be
1293 * added to the index and the state will be walked to the correct
1294 * location in the array for the next operation.
1295 *
1296 * If the iterator was referencing index %ULONG_MAX, this function wraps
1297 * around to 0.
1298 *
1299 * Return: The entry at the new index. This may be %NULL or an internal
1300 * entry.
1301 */
1302static inline void *xas_next(struct xa_state *xas)
1303{
1304 struct xa_node *node = xas->xa_node;
1305
1306 if (unlikely(xas_not_node(node) || node->shift ||
1307 xas->xa_offset == XA_CHUNK_MASK))
1308 return __xas_next(xas);
1309
1310 xas->xa_index++;
1311 xas->xa_offset++;
1312 return xa_entry(xas->xa, node, xas->xa_offset);
1313}
1314
24#endif /* _LINUX_XARRAY_H */ 1315#endif /* _LINUX_XARRAY_H */
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 620fc4d2559a..9eced2cc9f94 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -1,47 +1,21 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2/* Copyright(c) 2015 Intel Corporation. All rights reserved. */ 2/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
3#include <linux/radix-tree.h>
4#include <linux/device.h> 3#include <linux/device.h>
5#include <linux/types.h>
6#include <linux/pfn_t.h>
7#include <linux/io.h> 4#include <linux/io.h>
8#include <linux/kasan.h> 5#include <linux/kasan.h>
9#include <linux/mm.h>
10#include <linux/memory_hotplug.h> 6#include <linux/memory_hotplug.h>
7#include <linux/mm.h>
8#include <linux/pfn_t.h>
11#include <linux/swap.h> 9#include <linux/swap.h>
12#include <linux/swapops.h> 10#include <linux/swapops.h>
11#include <linux/types.h>
13#include <linux/wait_bit.h> 12#include <linux/wait_bit.h>
13#include <linux/xarray.h>
14 14
15static DEFINE_MUTEX(pgmap_lock); 15static DEFINE_XARRAY(pgmap_array);
16static RADIX_TREE(pgmap_radix, GFP_KERNEL);
17#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) 16#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
18#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) 17#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
19 18
20static unsigned long order_at(struct resource *res, unsigned long pgoff)
21{
22 unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
23 unsigned long nr_pages, mask;
24
25 nr_pages = PHYS_PFN(resource_size(res));
26 if (nr_pages == pgoff)
27 return ULONG_MAX;
28
29 /*
30 * What is the largest aligned power-of-2 range available from
31 * this resource pgoff to the end of the resource range,
32 * considering the alignment of the current pgoff?
33 */
34 mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
35 if (!mask)
36 return ULONG_MAX;
37
38 return find_first_bit(&mask, BITS_PER_LONG);
39}
40
41#define foreach_order_pgoff(res, order, pgoff) \
42 for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
43 pgoff += 1UL << order, order = order_at((res), pgoff))
44
45#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) 19#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
46vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, 20vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
47 unsigned long addr, 21 unsigned long addr,
@@ -70,18 +44,10 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
70EXPORT_SYMBOL(device_private_entry_fault); 44EXPORT_SYMBOL(device_private_entry_fault);
71#endif /* CONFIG_DEVICE_PRIVATE */ 45#endif /* CONFIG_DEVICE_PRIVATE */
72 46
73static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) 47static void pgmap_array_delete(struct resource *res)
74{ 48{
75 unsigned long pgoff, order; 49 xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
76 50 NULL, GFP_KERNEL);
77 mutex_lock(&pgmap_lock);
78 foreach_order_pgoff(res, order, pgoff) {
79 if (pgoff >= end_pgoff)
80 break;
81 radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
82 }
83 mutex_unlock(&pgmap_lock);
84
85 synchronize_rcu(); 51 synchronize_rcu();
86} 52}
87 53
@@ -142,7 +108,7 @@ static void devm_memremap_pages_release(void *data)
142 mem_hotplug_done(); 108 mem_hotplug_done();
143 109
144 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); 110 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
145 pgmap_radix_release(res, -1); 111 pgmap_array_delete(res);
146 dev_WARN_ONCE(dev, pgmap->altmap.alloc, 112 dev_WARN_ONCE(dev, pgmap->altmap.alloc,
147 "%s: failed to free all reserved pages\n", __func__); 113 "%s: failed to free all reserved pages\n", __func__);
148} 114}
@@ -177,7 +143,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
177 struct resource *res = &pgmap->res; 143 struct resource *res = &pgmap->res;
178 struct dev_pagemap *conflict_pgmap; 144 struct dev_pagemap *conflict_pgmap;
179 pgprot_t pgprot = PAGE_KERNEL; 145 pgprot_t pgprot = PAGE_KERNEL;
180 unsigned long pgoff, order;
181 int error, nid, is_ram; 146 int error, nid, is_ram;
182 147
183 align_start = res->start & ~(SECTION_SIZE - 1); 148 align_start = res->start & ~(SECTION_SIZE - 1);
@@ -216,20 +181,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
216 181
217 pgmap->dev = dev; 182 pgmap->dev = dev;
218 183
219 mutex_lock(&pgmap_lock); 184 error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
220 error = 0; 185 PHYS_PFN(res->end), pgmap, GFP_KERNEL));
221
222 foreach_order_pgoff(res, order, pgoff) {
223 error = __radix_tree_insert(&pgmap_radix,
224 PHYS_PFN(res->start) + pgoff, order, pgmap);
225 if (error) {
226 dev_err(dev, "%s: failed: %d\n", __func__, error);
227 break;
228 }
229 }
230 mutex_unlock(&pgmap_lock);
231 if (error) 186 if (error)
232 goto err_radix; 187 goto err_array;
233 188
234 nid = dev_to_node(dev); 189 nid = dev_to_node(dev);
235 if (nid < 0) 190 if (nid < 0)
@@ -274,8 +229,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
274 err_kasan: 229 err_kasan:
275 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); 230 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
276 err_pfn_remap: 231 err_pfn_remap:
277 err_radix: 232 pgmap_array_delete(res);
278 pgmap_radix_release(res, pgoff); 233 err_array:
279 return ERR_PTR(error); 234 return ERR_PTR(error);
280} 235}
281EXPORT_SYMBOL(devm_memremap_pages); 236EXPORT_SYMBOL(devm_memremap_pages);
@@ -315,7 +270,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
315 270
316 /* fall back to slow path lookup */ 271 /* fall back to slow path lookup */
317 rcu_read_lock(); 272 rcu_read_lock();
318 pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); 273 pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
319 if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) 274 if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
320 pgmap = NULL; 275 pgmap = NULL;
321 rcu_read_unlock(); 276 rcu_read_unlock();
diff --git a/lib/Kconfig b/lib/Kconfig
index d82f20609939..d1573a16aa92 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -399,8 +399,11 @@ config INTERVAL_TREE
399 399
400 for more information. 400 for more information.
401 401
402config RADIX_TREE_MULTIORDER 402config XARRAY_MULTI
403 bool 403 bool
404 help
405 Support entries which occupy multiple consecutive indices in the
406 XArray.
404 407
405config ASSOCIATIVE_ARRAY 408config ASSOCIATIVE_ARRAY
406 bool 409 bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 04adfc3b185e..e0ba05e6f6bd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1813,6 +1813,9 @@ config TEST_BITFIELD
1813config TEST_UUID 1813config TEST_UUID
1814 tristate "Test functions located in the uuid module at runtime" 1814 tristate "Test functions located in the uuid module at runtime"
1815 1815
1816config TEST_XARRAY
1817 tristate "Test the XArray code at runtime"
1818
1816config TEST_OVERFLOW 1819config TEST_OVERFLOW
1817 tristate "Test check_*_overflow() functions at runtime" 1820 tristate "Test check_*_overflow() functions at runtime"
1818 1821
diff --git a/lib/Makefile b/lib/Makefile
index fa3eb1b4c0e3..3d341f59f756 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -18,7 +18,7 @@ KCOV_INSTRUMENT_debugobjects.o := n
18KCOV_INSTRUMENT_dynamic_debug.o := n 18KCOV_INSTRUMENT_dynamic_debug.o := n
19 19
20lib-y := ctype.o string.o vsprintf.o cmdline.o \ 20lib-y := ctype.o string.o vsprintf.o cmdline.o \
21 rbtree.o radix-tree.o timerqueue.o\ 21 rbtree.o radix-tree.o timerqueue.o xarray.o \
22 idr.o int_sqrt.o extable.o \ 22 idr.o int_sqrt.o extable.o \
23 sha1.o chacha20.o irq_regs.o argv_split.o \ 23 sha1.o chacha20.o irq_regs.o argv_split.o \
24 flex_proportions.o ratelimit.o show_mem.o \ 24 flex_proportions.o ratelimit.o show_mem.o \
@@ -68,6 +68,7 @@ obj-$(CONFIG_TEST_PRINTF) += test_printf.o
68obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o 68obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
69obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o 69obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o
70obj-$(CONFIG_TEST_UUID) += test_uuid.o 70obj-$(CONFIG_TEST_UUID) += test_uuid.o
71obj-$(CONFIG_TEST_XARRAY) += test_xarray.o
71obj-$(CONFIG_TEST_PARMAN) += test_parman.o 72obj-$(CONFIG_TEST_PARMAN) += test_parman.o
72obj-$(CONFIG_TEST_KMOD) += test_kmod.o 73obj-$(CONFIG_TEST_KMOD) += test_kmod.o
73obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o 74obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
diff --git a/lib/idr.c b/lib/idr.c
index fab2fd5bc326..cb1db9b8d3f6 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -6,8 +6,6 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/xarray.h> 7#include <linux/xarray.h>
8 8
9DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap);
10
11/** 9/**
12 * idr_alloc_u32() - Allocate an ID. 10 * idr_alloc_u32() - Allocate an ID.
13 * @idr: IDR handle. 11 * @idr: IDR handle.
@@ -39,10 +37,8 @@ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
39 unsigned int base = idr->idr_base; 37 unsigned int base = idr->idr_base;
40 unsigned int id = *nextid; 38 unsigned int id = *nextid;
41 39
42 if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr))) 40 if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
43 return -EINVAL; 41 idr->idr_rt.xa_flags |= IDR_RT_MARKER;
44 if (WARN_ON_ONCE(!(idr->idr_rt.gfp_mask & ROOT_IS_IDR)))
45 idr->idr_rt.gfp_mask |= IDR_RT_MARKER;
46 42
47 id = (id < base) ? 0 : id - base; 43 id = (id < base) ? 0 : id - base;
48 radix_tree_iter_init(&iter, id); 44 radix_tree_iter_init(&iter, id);
@@ -295,15 +291,13 @@ void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
295 void __rcu **slot = NULL; 291 void __rcu **slot = NULL;
296 void *entry; 292 void *entry;
297 293
298 if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
299 return ERR_PTR(-EINVAL);
300 id -= idr->idr_base; 294 id -= idr->idr_base;
301 295
302 entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); 296 entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot);
303 if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) 297 if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
304 return ERR_PTR(-ENOENT); 298 return ERR_PTR(-ENOENT);
305 299
306 __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL); 300 __radix_tree_replace(&idr->idr_rt, node, slot, ptr);
307 301
308 return entry; 302 return entry;
309} 303}
@@ -324,6 +318,9 @@ EXPORT_SYMBOL(idr_replace);
324 * free the individual IDs in it. You can use ida_is_empty() to find 318 * free the individual IDs in it. You can use ida_is_empty() to find
325 * out whether the IDA has any IDs currently allocated. 319 * out whether the IDA has any IDs currently allocated.
326 * 320 *
321 * The IDA handles its own locking. It is safe to call any of the IDA
322 * functions without synchronisation in your code.
323 *
327 * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward 324 * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward
328 * limitation, it should be quite straightforward to raise the maximum. 325 * limitation, it should be quite straightforward to raise the maximum.
329 */ 326 */
@@ -331,161 +328,197 @@ EXPORT_SYMBOL(idr_replace);
331/* 328/*
332 * Developer's notes: 329 * Developer's notes:
333 * 330 *
334 * The IDA uses the functionality provided by the IDR & radix tree to store 331 * The IDA uses the functionality provided by the XArray to store bitmaps in
335 * bitmaps in each entry. The IDR_FREE tag means there is at least one bit 332 * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap
336 * free, unlike the IDR where it means at least one entry is free. 333 * have been set.
337 * 334 *
338 * I considered telling the radix tree that each slot is an order-10 node 335 * I considered telling the XArray that each slot is an order-10 node
339 * and storing the bit numbers in the radix tree, but the radix tree can't 336 * and indexing by bit number, but the XArray can't allow a single multi-index
340 * allow a single multiorder entry at index 0, which would significantly 337 * entry in the head, which would significantly increase memory consumption
341 * increase memory consumption for the IDA. So instead we divide the index 338 * for the IDA. So instead we divide the index by the number of bits in the
342 * by the number of bits in the leaf bitmap before doing a radix tree lookup. 339 * leaf bitmap before doing a radix tree lookup.
343 * 340 *
344 * As an optimisation, if there are only a few low bits set in any given 341 * As an optimisation, if there are only a few low bits set in any given
345 * leaf, instead of allocating a 128-byte bitmap, we use the 'exceptional 342 * leaf, instead of allocating a 128-byte bitmap, we store the bits
346 * entry' functionality of the radix tree to store BITS_PER_LONG - 2 bits 343 * as a value entry. Value entries never have the XA_FREE_MARK cleared
347 * directly in the entry. By being really tricksy, we could store 344 * because we can always convert them into a bitmap entry.
348 * BITS_PER_LONG - 1 bits, but there're diminishing returns after optimising 345 *
349 * for 0-3 allocated IDs. 346 * It would be possible to optimise further; once we've run out of a
350 * 347 * single 128-byte bitmap, we currently switch to a 576-byte node, put
351 * We allow the radix tree 'exceptional' count to get out of date. Nothing 348 * the 128-byte bitmap in the first entry and then start allocating extra
352 * in the IDA nor the radix tree code checks it. If it becomes important 349 * 128-byte entries. We could instead use the 512 bytes of the node's
353 * to maintain an accurate exceptional count, switch the rcu_assign_pointer() 350 * data as a bitmap before moving to that scheme. I do not believe this
354 * calls to radix_tree_iter_replace() which will correct the exceptional 351 * is a worthwhile optimisation; Rasmus Villemoes surveyed the current
355 * count. 352 * users of the IDA and almost none of them use more than 1024 entries.
356 * 353 * Those that do use more than the 8192 IDs that the 512 bytes would
357 * The IDA always requires a lock to alloc/free. If we add a 'test_bit' 354 * provide.
355 *
356 * The IDA always uses a lock to alloc/free. If we add a 'test_bit'
358 * equivalent, it will still need locking. Going to RCU lookup would require 357 * equivalent, it will still need locking. Going to RCU lookup would require
359 * using RCU to free bitmaps, and that's not trivial without embedding an 358 * using RCU to free bitmaps, and that's not trivial without embedding an
360 * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte 359 * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
361 * bitmap, which is excessive. 360 * bitmap, which is excessive.
362 */ 361 */
363 362
364#define IDA_MAX (0x80000000U / IDA_BITMAP_BITS - 1) 363/**
365 364 * ida_alloc_range() - Allocate an unused ID.
366static int ida_get_new_above(struct ida *ida, int start) 365 * @ida: IDA handle.
366 * @min: Lowest ID to allocate.
367 * @max: Highest ID to allocate.
368 * @gfp: Memory allocation flags.
369 *
370 * Allocate an ID between @min and @max, inclusive. The allocated ID will
371 * not exceed %INT_MAX, even if @max is larger.
372 *
373 * Context: Any context.
374 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
375 * or %-ENOSPC if there are no free IDs.
376 */
377int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
378 gfp_t gfp)
367{ 379{
368 struct radix_tree_root *root = &ida->ida_rt; 380 XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
369 void __rcu **slot; 381 unsigned bit = min % IDA_BITMAP_BITS;
370 struct radix_tree_iter iter; 382 unsigned long flags;
371 struct ida_bitmap *bitmap; 383 struct ida_bitmap *bitmap, *alloc = NULL;
372 unsigned long index; 384
373 unsigned bit, ebit; 385 if ((int)min < 0)
374 int new; 386 return -ENOSPC;
375 387
376 index = start / IDA_BITMAP_BITS; 388 if ((int)max < 0)
377 bit = start % IDA_BITMAP_BITS; 389 max = INT_MAX;
378 ebit = bit + RADIX_TREE_EXCEPTIONAL_SHIFT; 390
379 391retry:
380 slot = radix_tree_iter_init(&iter, index); 392 xas_lock_irqsave(&xas, flags);
381 for (;;) { 393next:
382 if (slot) 394 bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
383 slot = radix_tree_next_slot(slot, &iter, 395 if (xas.xa_index > min / IDA_BITMAP_BITS)
384 RADIX_TREE_ITER_TAGGED); 396 bit = 0;
385 if (!slot) { 397 if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
386 slot = idr_get_free(root, &iter, GFP_NOWAIT, IDA_MAX); 398 goto nospc;
387 if (IS_ERR(slot)) { 399
388 if (slot == ERR_PTR(-ENOMEM)) 400 if (xa_is_value(bitmap)) {
389 return -EAGAIN; 401 unsigned long tmp = xa_to_value(bitmap);
390 return PTR_ERR(slot); 402
403 if (bit < BITS_PER_XA_VALUE) {
404 bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
405 if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
406 goto nospc;
407 if (bit < BITS_PER_XA_VALUE) {
408 tmp |= 1UL << bit;
409 xas_store(&xas, xa_mk_value(tmp));
410 goto out;
391 } 411 }
392 } 412 }
393 if (iter.index > index) { 413 bitmap = alloc;
394 bit = 0; 414 if (!bitmap)
395 ebit = RADIX_TREE_EXCEPTIONAL_SHIFT; 415 bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
396 } 416 if (!bitmap)
397 new = iter.index * IDA_BITMAP_BITS; 417 goto alloc;
398 bitmap = rcu_dereference_raw(*slot); 418 bitmap->bitmap[0] = tmp;
399 if (radix_tree_exception(bitmap)) { 419 xas_store(&xas, bitmap);
400 unsigned long tmp = (unsigned long)bitmap; 420 if (xas_error(&xas)) {
401 ebit = find_next_zero_bit(&tmp, BITS_PER_LONG, ebit); 421 bitmap->bitmap[0] = 0;
402 if (ebit < BITS_PER_LONG) { 422 goto out;
403 tmp |= 1UL << ebit;
404 rcu_assign_pointer(*slot, (void *)tmp);
405 return new + ebit -
406 RADIX_TREE_EXCEPTIONAL_SHIFT;
407 }
408 bitmap = this_cpu_xchg(ida_bitmap, NULL);
409 if (!bitmap)
410 return -EAGAIN;
411 bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT;
412 rcu_assign_pointer(*slot, bitmap);
413 } 423 }
424 }
414 425
415 if (bitmap) { 426 if (bitmap) {
416 bit = find_next_zero_bit(bitmap->bitmap, 427 bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
417 IDA_BITMAP_BITS, bit); 428 if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
418 new += bit; 429 goto nospc;
419 if (new < 0) 430 if (bit == IDA_BITMAP_BITS)
420 return -ENOSPC; 431 goto next;
421 if (bit == IDA_BITMAP_BITS)
422 continue;
423 432
424 __set_bit(bit, bitmap->bitmap); 433 __set_bit(bit, bitmap->bitmap);
425 if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) 434 if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
426 radix_tree_iter_tag_clear(root, &iter, 435 xas_clear_mark(&xas, XA_FREE_MARK);
427 IDR_FREE); 436 } else {
437 if (bit < BITS_PER_XA_VALUE) {
438 bitmap = xa_mk_value(1UL << bit);
428 } else { 439 } else {
429 new += bit; 440 bitmap = alloc;
430 if (new < 0)
431 return -ENOSPC;
432 if (ebit < BITS_PER_LONG) {
433 bitmap = (void *)((1UL << ebit) |
434 RADIX_TREE_EXCEPTIONAL_ENTRY);
435 radix_tree_iter_replace(root, &iter, slot,
436 bitmap);
437 return new;
438 }
439 bitmap = this_cpu_xchg(ida_bitmap, NULL);
440 if (!bitmap) 441 if (!bitmap)
441 return -EAGAIN; 442 bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
443 if (!bitmap)
444 goto alloc;
442 __set_bit(bit, bitmap->bitmap); 445 __set_bit(bit, bitmap->bitmap);
443 radix_tree_iter_replace(root, &iter, slot, bitmap);
444 } 446 }
445 447 xas_store(&xas, bitmap);
446 return new; 448 }
449out:
450 xas_unlock_irqrestore(&xas, flags);
451 if (xas_nomem(&xas, gfp)) {
452 xas.xa_index = min / IDA_BITMAP_BITS;
453 bit = min % IDA_BITMAP_BITS;
454 goto retry;
447 } 455 }
456 if (bitmap != alloc)
457 kfree(alloc);
458 if (xas_error(&xas))
459 return xas_error(&xas);
460 return xas.xa_index * IDA_BITMAP_BITS + bit;
461alloc:
462 xas_unlock_irqrestore(&xas, flags);
463 alloc = kzalloc(sizeof(*bitmap), gfp);
464 if (!alloc)
465 return -ENOMEM;
466 xas_set(&xas, min / IDA_BITMAP_BITS);
467 bit = min % IDA_BITMAP_BITS;
468 goto retry;
469nospc:
470 xas_unlock_irqrestore(&xas, flags);
471 return -ENOSPC;
448} 472}
473EXPORT_SYMBOL(ida_alloc_range);
449 474
450static void ida_remove(struct ida *ida, int id) 475/**
476 * ida_free() - Release an allocated ID.
477 * @ida: IDA handle.
478 * @id: Previously allocated ID.
479 *
480 * Context: Any context.
481 */
482void ida_free(struct ida *ida, unsigned int id)
451{ 483{
452 unsigned long index = id / IDA_BITMAP_BITS; 484 XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
453 unsigned offset = id % IDA_BITMAP_BITS; 485 unsigned bit = id % IDA_BITMAP_BITS;
454 struct ida_bitmap *bitmap; 486 struct ida_bitmap *bitmap;
455 unsigned long *btmp; 487 unsigned long flags;
456 struct radix_tree_iter iter;
457 void __rcu **slot;
458 488
459 slot = radix_tree_iter_lookup(&ida->ida_rt, &iter, index); 489 BUG_ON((int)id < 0);
460 if (!slot) 490
461 goto err; 491 xas_lock_irqsave(&xas, flags);
492 bitmap = xas_load(&xas);
462 493
463 bitmap = rcu_dereference_raw(*slot); 494 if (xa_is_value(bitmap)) {
464 if (radix_tree_exception(bitmap)) { 495 unsigned long v = xa_to_value(bitmap);
465 btmp = (unsigned long *)slot; 496 if (bit >= BITS_PER_XA_VALUE)
466 offset += RADIX_TREE_EXCEPTIONAL_SHIFT;
467 if (offset >= BITS_PER_LONG)
468 goto err; 497 goto err;
498 if (!(v & (1UL << bit)))
499 goto err;
500 v &= ~(1UL << bit);
501 if (!v)
502 goto delete;
503 xas_store(&xas, xa_mk_value(v));
469 } else { 504 } else {
470 btmp = bitmap->bitmap; 505 if (!test_bit(bit, bitmap->bitmap))
471 } 506 goto err;
472 if (!test_bit(offset, btmp)) 507 __clear_bit(bit, bitmap->bitmap);
473 goto err; 508 xas_set_mark(&xas, XA_FREE_MARK);
474 509 if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
475 __clear_bit(offset, btmp); 510 kfree(bitmap);
476 radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE); 511delete:
477 if (radix_tree_exception(bitmap)) { 512 xas_store(&xas, NULL);
478 if (rcu_dereference_raw(*slot) == 513 }
479 (void *)RADIX_TREE_EXCEPTIONAL_ENTRY)
480 radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
481 } else if (bitmap_empty(btmp, IDA_BITMAP_BITS)) {
482 kfree(bitmap);
483 radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
484 } 514 }
515 xas_unlock_irqrestore(&xas, flags);
485 return; 516 return;
486 err: 517 err:
518 xas_unlock_irqrestore(&xas, flags);
487 WARN(1, "ida_free called for id=%d which is not allocated.\n", id); 519 WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
488} 520}
521EXPORT_SYMBOL(ida_free);
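Taken together with ida_alloc_range() above, the public API is small; a usage sketch under the assumption of a driver-private IDA (example_ida and the minor-number range are hypothetical):

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDA(example_ida);			/* hypothetical IDA instance */

static int example_get_minor(void)
{
	/* Allocate an ID in [0, 255]; returns the ID or a negative errno. */
	return ida_alloc_range(&example_ida, 0, 255, GFP_KERNEL);
}

static void example_put_minor(int minor)
{
	/* Must pair with a successful allocation, or ida_free() will WARN. */
	ida_free(&example_ida, minor);
}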
489 522
490/** 523/**
491 * ida_destroy() - Free all IDs. 524 * ida_destroy() - Free all IDs.
@@ -500,80 +533,60 @@ static void ida_remove(struct ida *ida, int id)
500 */ 533 */
501void ida_destroy(struct ida *ida) 534void ida_destroy(struct ida *ida)
502{ 535{
536 XA_STATE(xas, &ida->xa, 0);
537 struct ida_bitmap *bitmap;
503 unsigned long flags; 538 unsigned long flags;
504 struct radix_tree_iter iter;
505 void __rcu **slot;
506 539
507 xa_lock_irqsave(&ida->ida_rt, flags); 540 xas_lock_irqsave(&xas, flags);
508 radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) { 541 xas_for_each(&xas, bitmap, ULONG_MAX) {
509 struct ida_bitmap *bitmap = rcu_dereference_raw(*slot); 542 if (!xa_is_value(bitmap))
510 if (!radix_tree_exception(bitmap))
511 kfree(bitmap); 543 kfree(bitmap);
512 radix_tree_iter_delete(&ida->ida_rt, &iter, slot); 544 xas_store(&xas, NULL);
513 } 545 }
514 xa_unlock_irqrestore(&ida->ida_rt, flags); 546 xas_unlock_irqrestore(&xas, flags);
515} 547}
516EXPORT_SYMBOL(ida_destroy); 548EXPORT_SYMBOL(ida_destroy);
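When the whole namespace goes away there is no need to release IDs one by one; a short sketch, continuing the hypothetical example_ida above:

static void example_teardown(void)
{
	ida_destroy(&example_ida);
	/* All bitmaps are gone; the IDA may be reused afterwards. */
	WARN_ON(!ida_is_empty(&example_ida));
}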
517 549
518/** 550#ifndef __KERNEL__
519 * ida_alloc_range() - Allocate an unused ID. 551extern void xa_dump_index(unsigned long index, unsigned int shift);
520 * @ida: IDA handle. 552#define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS)
521 * @min: Lowest ID to allocate.
522 * @max: Highest ID to allocate.
523 * @gfp: Memory allocation flags.
524 *
525 * Allocate an ID between @min and @max, inclusive. The allocated ID will
526 * not exceed %INT_MAX, even if @max is larger.
527 *
528 * Context: Any context.
529 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
530 * or %-ENOSPC if there are no free IDs.
531 */
532int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
533 gfp_t gfp)
534{
535 int id = 0;
536 unsigned long flags;
537 553
538 if ((int)min < 0) 554static void ida_dump_entry(void *entry, unsigned long index)
539 return -ENOSPC; 555{
540 556 unsigned long i;
541 if ((int)max < 0) 557
542 max = INT_MAX; 558 if (!entry)
543 559 return;
544again: 560
545 xa_lock_irqsave(&ida->ida_rt, flags); 561 if (xa_is_node(entry)) {
546 id = ida_get_new_above(ida, min); 562 struct xa_node *node = xa_to_node(entry);
547 if (id > (int)max) { 563 unsigned int shift = node->shift + IDA_CHUNK_SHIFT +
548 ida_remove(ida, id); 564 XA_CHUNK_SHIFT;
549 id = -ENOSPC; 565
550 } 566 xa_dump_index(index * IDA_BITMAP_BITS, shift);
551 xa_unlock_irqrestore(&ida->ida_rt, flags); 567 xa_dump_node(node);
568 for (i = 0; i < XA_CHUNK_SIZE; i++)
569 ida_dump_entry(node->slots[i],
570 index | (i << node->shift));
571 } else if (xa_is_value(entry)) {
572 xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
573 pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
574 } else {
575 struct ida_bitmap *bitmap = entry;
552 576
553 if (unlikely(id == -EAGAIN)) { 577 xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
554 if (!ida_pre_get(ida, gfp)) 578 pr_cont("bitmap: %p data", bitmap);
555 return -ENOMEM; 579 for (i = 0; i < IDA_BITMAP_LONGS; i++)
556 goto again; 580 pr_cont(" %lx", bitmap->bitmap[i]);
581 pr_cont("\n");
557 } 582 }
558
559 return id;
560} 583}
561EXPORT_SYMBOL(ida_alloc_range);
562 584
563/** 585static void ida_dump(struct ida *ida)
564 * ida_free() - Release an allocated ID.
565 * @ida: IDA handle.
566 * @id: Previously allocated ID.
567 *
568 * Context: Any context.
569 */
570void ida_free(struct ida *ida, unsigned int id)
571{ 586{
572 unsigned long flags; 587 struct xarray *xa = &ida->xa;
573 588 pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head,
574 BUG_ON((int)id < 0); 589 xa->xa_flags >> ROOT_TAG_SHIFT);
575 xa_lock_irqsave(&ida->ida_rt, flags); 590 ida_dump_entry(xa->xa_head, 0);
576 ida_remove(ida, id);
577 xa_unlock_irqrestore(&ida->ida_rt, flags);
578} 591}
579EXPORT_SYMBOL(ida_free); 592#endif
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index bc03ecc4dfd2..1106bb6aa01e 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -38,15 +38,13 @@
38#include <linux/rcupdate.h> 38#include <linux/rcupdate.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/xarray.h>
41 42
42 43
43/* Number of nodes in fully populated tree of given height */
44static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
45
46/* 44/*
47 * Radix tree node cache. 45 * Radix tree node cache.
48 */ 46 */
49static struct kmem_cache *radix_tree_node_cachep; 47struct kmem_cache *radix_tree_node_cachep;
50 48
51/* 49/*
52 * The radix tree is variable-height, so an insert operation not only has 50 * The radix tree is variable-height, so an insert operation not only has
@@ -98,24 +96,7 @@ static inline void *node_to_entry(void *ptr)
98 return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); 96 return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
99} 97}
100 98
101#define RADIX_TREE_RETRY node_to_entry(NULL) 99#define RADIX_TREE_RETRY XA_RETRY_ENTRY
102
103#ifdef CONFIG_RADIX_TREE_MULTIORDER
104/* Sibling slots point directly to another slot in the same node */
105static inline
106bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
107{
108 void __rcu **ptr = node;
109 return (parent->slots <= ptr) &&
110 (ptr < parent->slots + RADIX_TREE_MAP_SIZE);
111}
112#else
113static inline
114bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
115{
116 return false;
117}
118#endif
119 100
120static inline unsigned long 101static inline unsigned long
121get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) 102get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
@@ -129,24 +110,13 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
129 unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; 110 unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
130 void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); 111 void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);
131 112
132#ifdef CONFIG_RADIX_TREE_MULTIORDER
133 if (radix_tree_is_internal_node(entry)) {
134 if (is_sibling_entry(parent, entry)) {
135 void __rcu **sibentry;
136 sibentry = (void __rcu **) entry_to_node(entry);
137 offset = get_slot_offset(parent, sibentry);
138 entry = rcu_dereference_raw(*sibentry);
139 }
140 }
141#endif
142
143 *nodep = (void *)entry; 113 *nodep = (void *)entry;
144 return offset; 114 return offset;
145} 115}
146 116
147static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) 117static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
148{ 118{
149 return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK); 119 return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK);
150} 120}
151 121
152static inline void tag_set(struct radix_tree_node *node, unsigned int tag, 122static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
@@ -169,32 +139,32 @@ static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
169 139
170static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) 140static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
171{ 141{
172 root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); 142 root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
173} 143}
174 144
175static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) 145static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
176{ 146{
177 root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); 147 root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
178} 148}
179 149
180static inline void root_tag_clear_all(struct radix_tree_root *root) 150static inline void root_tag_clear_all(struct radix_tree_root *root)
181{ 151{
182 root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1; 152 root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
183} 153}
184 154
185static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) 155static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
186{ 156{
187 return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT)); 157 return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
188} 158}
189 159
190static inline unsigned root_tags_get(const struct radix_tree_root *root) 160static inline unsigned root_tags_get(const struct radix_tree_root *root)
191{ 161{
192 return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT; 162 return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
193} 163}
194 164
195static inline bool is_idr(const struct radix_tree_root *root) 165static inline bool is_idr(const struct radix_tree_root *root)
196{ 166{
197 return !!(root->gfp_mask & ROOT_IS_IDR); 167 return !!(root->xa_flags & ROOT_IS_IDR);
198} 168}
199 169
200/* 170/*
@@ -254,7 +224,7 @@ radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
254 224
255static unsigned int iter_offset(const struct radix_tree_iter *iter) 225static unsigned int iter_offset(const struct radix_tree_iter *iter)
256{ 226{
257 return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; 227 return iter->index & RADIX_TREE_MAP_MASK;
258} 228}
259 229
260/* 230/*
@@ -277,99 +247,6 @@ static unsigned long next_index(unsigned long index,
277 return (index & ~node_maxindex(node)) + (offset << node->shift); 247 return (index & ~node_maxindex(node)) + (offset << node->shift);
278} 248}
279 249
280#ifndef __KERNEL__
281static void dump_node(struct radix_tree_node *node, unsigned long index)
282{
283 unsigned long i;
284
285 pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n",
286 node, node->offset, index, index | node_maxindex(node),
287 node->parent,
288 node->tags[0][0], node->tags[1][0], node->tags[2][0],
289 node->shift, node->count, node->exceptional);
290
291 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
292 unsigned long first = index | (i << node->shift);
293 unsigned long last = first | ((1UL << node->shift) - 1);
294 void *entry = node->slots[i];
295 if (!entry)
296 continue;
297 if (entry == RADIX_TREE_RETRY) {
298 pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n",
299 i, first, last, node);
300 } else if (!radix_tree_is_internal_node(entry)) {
301 pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n",
302 entry, i, first, last, node);
303 } else if (is_sibling_entry(node, entry)) {
304 pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n",
305 entry, i, first, last, node,
306 *(void **)entry_to_node(entry));
307 } else {
308 dump_node(entry_to_node(entry), first);
309 }
310 }
311}
312
313/* For debug */
314static void radix_tree_dump(struct radix_tree_root *root)
315{
316 pr_debug("radix root: %p rnode %p tags %x\n",
317 root, root->rnode,
318 root->gfp_mask >> ROOT_TAG_SHIFT);
319 if (!radix_tree_is_internal_node(root->rnode))
320 return;
321 dump_node(entry_to_node(root->rnode), 0);
322}
323
324static void dump_ida_node(void *entry, unsigned long index)
325{
326 unsigned long i;
327
328 if (!entry)
329 return;
330
331 if (radix_tree_is_internal_node(entry)) {
332 struct radix_tree_node *node = entry_to_node(entry);
333
334 pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n",
335 node, node->offset, index * IDA_BITMAP_BITS,
336 ((index | node_maxindex(node)) + 1) *
337 IDA_BITMAP_BITS - 1,
338 node->parent, node->tags[0][0], node->shift,
339 node->count);
340 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
341 dump_ida_node(node->slots[i],
342 index | (i << node->shift));
343 } else if (radix_tree_exceptional_entry(entry)) {
344 pr_debug("ida excp: %p offset %d indices %lu-%lu data %lx\n",
345 entry, (int)(index & RADIX_TREE_MAP_MASK),
346 index * IDA_BITMAP_BITS,
347 index * IDA_BITMAP_BITS + BITS_PER_LONG -
348 RADIX_TREE_EXCEPTIONAL_SHIFT,
349 (unsigned long)entry >>
350 RADIX_TREE_EXCEPTIONAL_SHIFT);
351 } else {
352 struct ida_bitmap *bitmap = entry;
353
354 pr_debug("ida btmp: %p offset %d indices %lu-%lu data", bitmap,
355 (int)(index & RADIX_TREE_MAP_MASK),
356 index * IDA_BITMAP_BITS,
357 (index + 1) * IDA_BITMAP_BITS - 1);
358 for (i = 0; i < IDA_BITMAP_LONGS; i++)
359 pr_cont(" %lx", bitmap->bitmap[i]);
360 pr_cont("\n");
361 }
362}
363
364static void ida_dump(struct ida *ida)
365{
366 struct radix_tree_root *root = &ida->ida_rt;
367 pr_debug("ida: %p node %p free %d\n", ida, root->rnode,
368 root->gfp_mask >> ROOT_TAG_SHIFT);
369 dump_ida_node(root->rnode, 0);
370}
371#endif
372
373/* 250/*
374 * This assumes that the caller has performed appropriate preallocation, and 251 * This assumes that the caller has performed appropriate preallocation, and
375 * that the caller has pinned this thread of control to the current CPU. 252 * that the caller has pinned this thread of control to the current CPU.
@@ -378,7 +255,7 @@ static struct radix_tree_node *
378radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, 255radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
379 struct radix_tree_root *root, 256 struct radix_tree_root *root,
380 unsigned int shift, unsigned int offset, 257 unsigned int shift, unsigned int offset,
381 unsigned int count, unsigned int exceptional) 258 unsigned int count, unsigned int nr_values)
382{ 259{
383 struct radix_tree_node *ret = NULL; 260 struct radix_tree_node *ret = NULL;
384 261
@@ -425,14 +302,14 @@ out:
425 ret->shift = shift; 302 ret->shift = shift;
426 ret->offset = offset; 303 ret->offset = offset;
427 ret->count = count; 304 ret->count = count;
428 ret->exceptional = exceptional; 305 ret->nr_values = nr_values;
429 ret->parent = parent; 306 ret->parent = parent;
430 ret->root = root; 307 ret->array = root;
431 } 308 }
432 return ret; 309 return ret;
433} 310}
434 311
435static void radix_tree_node_rcu_free(struct rcu_head *head) 312void radix_tree_node_rcu_free(struct rcu_head *head)
436{ 313{
437 struct radix_tree_node *node = 314 struct radix_tree_node *node =
438 container_of(head, struct radix_tree_node, rcu_head); 315 container_of(head, struct radix_tree_node, rcu_head);
@@ -530,77 +407,10 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
530} 407}
531EXPORT_SYMBOL(radix_tree_maybe_preload); 408EXPORT_SYMBOL(radix_tree_maybe_preload);
532 409
533#ifdef CONFIG_RADIX_TREE_MULTIORDER
534/*
535 * Preload with enough objects to ensure that we can split a single entry
536 * of order @old_order into many entries of size @new_order
537 */
538int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
539 gfp_t gfp_mask)
540{
541 unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT);
542 unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
543 (new_order / RADIX_TREE_MAP_SHIFT);
544 unsigned nr = 0;
545
546 WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
547 BUG_ON(new_order >= old_order);
548
549 while (layers--)
550 nr = nr * RADIX_TREE_MAP_SIZE + 1;
551 return __radix_tree_preload(gfp_mask, top * nr);
552}
553#endif
554
555/*
556 * The same as function above, but preload number of nodes required to insert
557 * (1 << order) continuous naturally-aligned elements.
558 */
559int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
560{
561 unsigned long nr_subtrees;
562 int nr_nodes, subtree_height;
563
564 /* Preloading doesn't help anything with this gfp mask, skip it */
565 if (!gfpflags_allow_blocking(gfp_mask)) {
566 preempt_disable();
567 return 0;
568 }
569
570 /*
571 * Calculate number and height of fully populated subtrees it takes to
572 * store (1 << order) elements.
573 */
574 nr_subtrees = 1 << order;
575 for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE;
576 subtree_height++)
577 nr_subtrees >>= RADIX_TREE_MAP_SHIFT;
578
579 /*
580 * The worst case is zero height tree with a single item at index 0 and
581 * then inserting items starting at ULONG_MAX - (1 << order).
582 *
583 * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to
584 * 0-index item.
585 */
586 nr_nodes = RADIX_TREE_MAX_PATH;
587
588 /* Plus branch to fully populated subtrees. */
589 nr_nodes += RADIX_TREE_MAX_PATH - subtree_height;
590
591 /* Root node is shared. */
592 nr_nodes--;
593
594 /* Plus nodes required to build subtrees. */
595 nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height];
596
597 return __radix_tree_preload(gfp_mask, nr_nodes);
598}
599
600static unsigned radix_tree_load_root(const struct radix_tree_root *root, 410static unsigned radix_tree_load_root(const struct radix_tree_root *root,
601 struct radix_tree_node **nodep, unsigned long *maxindex) 411 struct radix_tree_node **nodep, unsigned long *maxindex)
602{ 412{
603 struct radix_tree_node *node = rcu_dereference_raw(root->rnode); 413 struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
604 414
605 *nodep = node; 415 *nodep = node;
606 416
@@ -629,7 +439,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
629 while (index > shift_maxindex(maxshift)) 439 while (index > shift_maxindex(maxshift))
630 maxshift += RADIX_TREE_MAP_SHIFT; 440 maxshift += RADIX_TREE_MAP_SHIFT;
631 441
632 entry = rcu_dereference_raw(root->rnode); 442 entry = rcu_dereference_raw(root->xa_head);
633 if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) 443 if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
634 goto out; 444 goto out;
635 445
@@ -656,9 +466,9 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
656 BUG_ON(shift > BITS_PER_LONG); 466 BUG_ON(shift > BITS_PER_LONG);
657 if (radix_tree_is_internal_node(entry)) { 467 if (radix_tree_is_internal_node(entry)) {
658 entry_to_node(entry)->parent = node; 468 entry_to_node(entry)->parent = node;
659 } else if (radix_tree_exceptional_entry(entry)) { 469 } else if (xa_is_value(entry)) {
660 /* Moving an exceptional root->rnode to a node */ 470 /* Moving a value entry root->xa_head to a node */
661 node->exceptional = 1; 471 node->nr_values = 1;
662 } 472 }
663 /* 473 /*
664 * entry was already in the radix tree, so we do not need 474 * entry was already in the radix tree, so we do not need
@@ -666,7 +476,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
666 */ 476 */
667 node->slots[0] = (void __rcu *)entry; 477 node->slots[0] = (void __rcu *)entry;
668 entry = node_to_entry(node); 478 entry = node_to_entry(node);
669 rcu_assign_pointer(root->rnode, entry); 479 rcu_assign_pointer(root->xa_head, entry);
670 shift += RADIX_TREE_MAP_SHIFT; 480 shift += RADIX_TREE_MAP_SHIFT;
671 } while (shift <= maxshift); 481 } while (shift <= maxshift);
672out: 482out:
@@ -677,13 +487,12 @@ out:
677 * radix_tree_shrink - shrink radix tree to minimum height 487 * radix_tree_shrink - shrink radix tree to minimum height
678 * @root radix tree root 488 * @root radix tree root
679 */ 489 */
680static inline bool radix_tree_shrink(struct radix_tree_root *root, 490static inline bool radix_tree_shrink(struct radix_tree_root *root)
681 radix_tree_update_node_t update_node)
682{ 491{
683 bool shrunk = false; 492 bool shrunk = false;
684 493
685 for (;;) { 494 for (;;) {
686 struct radix_tree_node *node = rcu_dereference_raw(root->rnode); 495 struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
687 struct radix_tree_node *child; 496 struct radix_tree_node *child;
688 497
689 if (!radix_tree_is_internal_node(node)) 498 if (!radix_tree_is_internal_node(node))
@@ -692,15 +501,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
692 501
693 /* 502 /*
694 * The candidate node has more than one child, or its child 503 * The candidate node has more than one child, or its child
695 * is not at the leftmost slot, or the child is a multiorder 504 * is not at the leftmost slot, we cannot shrink.
696 * entry, we cannot shrink.
697 */ 505 */
698 if (node->count != 1) 506 if (node->count != 1)
699 break; 507 break;
700 child = rcu_dereference_raw(node->slots[0]); 508 child = rcu_dereference_raw(node->slots[0]);
701 if (!child) 509 if (!child)
702 break; 510 break;
703 if (!radix_tree_is_internal_node(child) && node->shift) 511
512 /*
513 * For an IDR, we must not shrink entry 0 into the root in
514 * case somebody calls idr_replace() with a pointer that
515 * appears to be an internal entry
516 */
517 if (!node->shift && is_idr(root))
704 break; 518 break;
705 519
706 if (radix_tree_is_internal_node(child)) 520 if (radix_tree_is_internal_node(child))
@@ -711,9 +525,9 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
711 * moving the node from one part of the tree to another: if it 525 * moving the node from one part of the tree to another: if it
712 * was safe to dereference the old pointer to it 526 * was safe to dereference the old pointer to it
713 * (node->slots[0]), it will be safe to dereference the new 527 * (node->slots[0]), it will be safe to dereference the new
714 * one (root->rnode) as far as dependent read barriers go. 528 * one (root->xa_head) as far as dependent read barriers go.
715 */ 529 */
716 root->rnode = (void __rcu *)child; 530 root->xa_head = (void __rcu *)child;
717 if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) 531 if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
718 root_tag_clear(root, IDR_FREE); 532 root_tag_clear(root, IDR_FREE);
719 533
@@ -738,8 +552,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
738 node->count = 0; 552 node->count = 0;
739 if (!radix_tree_is_internal_node(child)) { 553 if (!radix_tree_is_internal_node(child)) {
740 node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; 554 node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
741 if (update_node)
742 update_node(node);
743 } 555 }
744 556
745 WARN_ON_ONCE(!list_empty(&node->private_list)); 557 WARN_ON_ONCE(!list_empty(&node->private_list));
@@ -751,8 +563,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
751} 563}
752 564
753static bool delete_node(struct radix_tree_root *root, 565static bool delete_node(struct radix_tree_root *root,
754 struct radix_tree_node *node, 566 struct radix_tree_node *node)
755 radix_tree_update_node_t update_node)
756{ 567{
757 bool deleted = false; 568 bool deleted = false;
758 569
@@ -761,9 +572,8 @@ static bool delete_node(struct radix_tree_root *root,
761 572
762 if (node->count) { 573 if (node->count) {
763 if (node_to_entry(node) == 574 if (node_to_entry(node) ==
764 rcu_dereference_raw(root->rnode)) 575 rcu_dereference_raw(root->xa_head))
765 deleted |= radix_tree_shrink(root, 576 deleted |= radix_tree_shrink(root);
766 update_node);
767 return deleted; 577 return deleted;
768 } 578 }
769 579
@@ -778,7 +588,7 @@ static bool delete_node(struct radix_tree_root *root,
778 */ 588 */
779 if (!is_idr(root)) 589 if (!is_idr(root))
780 root_tag_clear_all(root); 590 root_tag_clear_all(root);
781 root->rnode = NULL; 591 root->xa_head = NULL;
782 } 592 }
783 593
784 WARN_ON_ONCE(!list_empty(&node->private_list)); 594 WARN_ON_ONCE(!list_empty(&node->private_list));
@@ -795,7 +605,6 @@ static bool delete_node(struct radix_tree_root *root,
795 * __radix_tree_create - create a slot in a radix tree 605 * __radix_tree_create - create a slot in a radix tree
796 * @root: radix tree root 606 * @root: radix tree root
797 * @index: index key 607 * @index: index key
798 * @order: index occupies 2^order aligned slots
799 * @nodep: returns node 608 * @nodep: returns node
800 * @slotp: returns slot 609 * @slotp: returns slot
801 * 610 *
@@ -803,36 +612,34 @@ static bool delete_node(struct radix_tree_root *root,
803 * at position @index in the radix tree @root. 612 * at position @index in the radix tree @root.
804 * 613 *
805 * Until there is more than one item in the tree, no nodes are 614 * Until there is more than one item in the tree, no nodes are
806 * allocated and @root->rnode is used as a direct slot instead of 615 * allocated and @root->xa_head is used as a direct slot instead of
807 * pointing to a node, in which case *@nodep will be NULL. 616 * pointing to a node, in which case *@nodep will be NULL.
808 * 617 *
809 * Returns -ENOMEM, or 0 for success. 618 * Returns -ENOMEM, or 0 for success.
810 */ 619 */
811int __radix_tree_create(struct radix_tree_root *root, unsigned long index, 620static int __radix_tree_create(struct radix_tree_root *root,
812 unsigned order, struct radix_tree_node **nodep, 621 unsigned long index, struct radix_tree_node **nodep,
813 void __rcu ***slotp) 622 void __rcu ***slotp)
814{ 623{
815 struct radix_tree_node *node = NULL, *child; 624 struct radix_tree_node *node = NULL, *child;
816 void __rcu **slot = (void __rcu **)&root->rnode; 625 void __rcu **slot = (void __rcu **)&root->xa_head;
817 unsigned long maxindex; 626 unsigned long maxindex;
818 unsigned int shift, offset = 0; 627 unsigned int shift, offset = 0;
819 unsigned long max = index | ((1UL << order) - 1); 628 unsigned long max = index;
820 gfp_t gfp = root_gfp_mask(root); 629 gfp_t gfp = root_gfp_mask(root);
821 630
822 shift = radix_tree_load_root(root, &child, &maxindex); 631 shift = radix_tree_load_root(root, &child, &maxindex);
823 632
824 /* Make sure the tree is high enough. */ 633 /* Make sure the tree is high enough. */
825 if (order > 0 && max == ((1UL << order) - 1))
826 max++;
827 if (max > maxindex) { 634 if (max > maxindex) {
828 int error = radix_tree_extend(root, gfp, max, shift); 635 int error = radix_tree_extend(root, gfp, max, shift);
829 if (error < 0) 636 if (error < 0)
830 return error; 637 return error;
831 shift = error; 638 shift = error;
832 child = rcu_dereference_raw(root->rnode); 639 child = rcu_dereference_raw(root->xa_head);
833 } 640 }
834 641
835 while (shift > order) { 642 while (shift > 0) {
836 shift -= RADIX_TREE_MAP_SHIFT; 643 shift -= RADIX_TREE_MAP_SHIFT;
837 if (child == NULL) { 644 if (child == NULL) {
838 /* Have to add a child node. */ 645 /* Have to add a child node. */
@@ -875,8 +682,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
875 682
876 for (;;) { 683 for (;;) {
877 void *entry = rcu_dereference_raw(child->slots[offset]); 684 void *entry = rcu_dereference_raw(child->slots[offset]);
878 if (radix_tree_is_internal_node(entry) && 685 if (xa_is_node(entry) && child->shift) {
879 !is_sibling_entry(child, entry)) {
880 child = entry_to_node(entry); 686 child = entry_to_node(entry);
881 offset = 0; 687 offset = 0;
882 continue; 688 continue;
@@ -894,96 +700,30 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
894 } 700 }
895} 701}
896 702
897#ifdef CONFIG_RADIX_TREE_MULTIORDER
898static inline int insert_entries(struct radix_tree_node *node, 703static inline int insert_entries(struct radix_tree_node *node,
899 void __rcu **slot, void *item, unsigned order, bool replace) 704 void __rcu **slot, void *item, bool replace)
900{
901 struct radix_tree_node *child;
902 unsigned i, n, tag, offset, tags = 0;
903
904 if (node) {
905 if (order > node->shift)
906 n = 1 << (order - node->shift);
907 else
908 n = 1;
909 offset = get_slot_offset(node, slot);
910 } else {
911 n = 1;
912 offset = 0;
913 }
914
915 if (n > 1) {
916 offset = offset & ~(n - 1);
917 slot = &node->slots[offset];
918 }
919 child = node_to_entry(slot);
920
921 for (i = 0; i < n; i++) {
922 if (slot[i]) {
923 if (replace) {
924 node->count--;
925 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
926 if (tag_get(node, tag, offset + i))
927 tags |= 1 << tag;
928 } else
929 return -EEXIST;
930 }
931 }
932
933 for (i = 0; i < n; i++) {
934 struct radix_tree_node *old = rcu_dereference_raw(slot[i]);
935 if (i) {
936 rcu_assign_pointer(slot[i], child);
937 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
938 if (tags & (1 << tag))
939 tag_clear(node, tag, offset + i);
940 } else {
941 rcu_assign_pointer(slot[i], item);
942 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
943 if (tags & (1 << tag))
944 tag_set(node, tag, offset);
945 }
946 if (radix_tree_is_internal_node(old) &&
947 !is_sibling_entry(node, old) &&
948 (old != RADIX_TREE_RETRY))
949 radix_tree_free_nodes(old);
950 if (radix_tree_exceptional_entry(old))
951 node->exceptional--;
952 }
953 if (node) {
954 node->count += n;
955 if (radix_tree_exceptional_entry(item))
956 node->exceptional += n;
957 }
958 return n;
959}
960#else
961static inline int insert_entries(struct radix_tree_node *node,
962 void __rcu **slot, void *item, unsigned order, bool replace)
963{ 705{
964 if (*slot) 706 if (*slot)
965 return -EEXIST; 707 return -EEXIST;
966 rcu_assign_pointer(*slot, item); 708 rcu_assign_pointer(*slot, item);
967 if (node) { 709 if (node) {
968 node->count++; 710 node->count++;
969 if (radix_tree_exceptional_entry(item)) 711 if (xa_is_value(item))
970 node->exceptional++; 712 node->nr_values++;
971 } 713 }
972 return 1; 714 return 1;
973} 715}
974#endif
975 716
976/** 717/**
977 * __radix_tree_insert - insert into a radix tree 718 * __radix_tree_insert - insert into a radix tree
978 * @root: radix tree root 719 * @root: radix tree root
979 * @index: index key 720 * @index: index key
980 * @order: key covers the 2^order indices around index
981 * @item: item to insert 721 * @item: item to insert
982 * 722 *
983 * Insert an item into the radix tree at position @index. 723 * Insert an item into the radix tree at position @index.
984 */ 724 */
985int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, 725int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
986 unsigned order, void *item) 726 void *item)
987{ 727{
988 struct radix_tree_node *node; 728 struct radix_tree_node *node;
989 void __rcu **slot; 729 void __rcu **slot;
@@ -991,11 +731,11 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
991 731
992 BUG_ON(radix_tree_is_internal_node(item)); 732 BUG_ON(radix_tree_is_internal_node(item));
993 733
994 error = __radix_tree_create(root, index, order, &node, &slot); 734 error = __radix_tree_create(root, index, &node, &slot);
995 if (error) 735 if (error)
996 return error; 736 return error;
997 737
998 error = insert_entries(node, slot, item, order, false); 738 error = insert_entries(node, slot, item, false);
999 if (error < 0) 739 if (error < 0)
1000 return error; 740 return error;
1001 741
@@ -1010,7 +750,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
1010 750
1011 return 0; 751 return 0;
1012} 752}
1013EXPORT_SYMBOL(__radix_tree_insert); 753EXPORT_SYMBOL(radix_tree_insert);
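With the order argument gone, radix_tree_insert() stores exactly one index; a minimal usage sketch (my_tree and both helpers are hypothetical), assuming the caller serialises modifications itself:

#include <linux/radix-tree.h>
#include <linux/gfp.h>

static RADIX_TREE(my_tree, GFP_KERNEL);		/* hypothetical tree */

static int my_store(unsigned long index, void *item)
{
	/* Returns 0, -EEXIST if the slot is already used, or -ENOMEM. */
	return radix_tree_insert(&my_tree, index, item);
}

static void *my_remove(unsigned long index)
{
	/* Returns the removed entry, or NULL if nothing was stored there. */
	return radix_tree_delete(&my_tree, index);
}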
1014 754
1015/** 755/**
1016 * __radix_tree_lookup - lookup an item in a radix tree 756 * __radix_tree_lookup - lookup an item in a radix tree
@@ -1023,7 +763,7 @@ EXPORT_SYMBOL(__radix_tree_insert);
1023 * tree @root. 763 * tree @root.
1024 * 764 *
1025 * Until there is more than one item in the tree, no nodes are 765 * Until there is more than one item in the tree, no nodes are
1026 * allocated and @root->rnode is used as a direct slot instead of 766 * allocated and @root->xa_head is used as a direct slot instead of
1027 * pointing to a node, in which case *@nodep will be NULL. 767 * pointing to a node, in which case *@nodep will be NULL.
1028 */ 768 */
1029void *__radix_tree_lookup(const struct radix_tree_root *root, 769void *__radix_tree_lookup(const struct radix_tree_root *root,
@@ -1036,7 +776,7 @@ void *__radix_tree_lookup(const struct radix_tree_root *root,
1036 776
1037 restart: 777 restart:
1038 parent = NULL; 778 parent = NULL;
1039 slot = (void __rcu **)&root->rnode; 779 slot = (void __rcu **)&root->xa_head;
1040 radix_tree_load_root(root, &node, &maxindex); 780 radix_tree_load_root(root, &node, &maxindex);
1041 if (index > maxindex) 781 if (index > maxindex)
1042 return NULL; 782 return NULL;
@@ -1049,6 +789,8 @@ void *__radix_tree_lookup(const struct radix_tree_root *root,
1049 parent = entry_to_node(node); 789 parent = entry_to_node(node);
1050 offset = radix_tree_descend(parent, &node, index); 790 offset = radix_tree_descend(parent, &node, index);
1051 slot = parent->slots + offset; 791 slot = parent->slots + offset;
792 if (parent->shift == 0)
793 break;
1052 } 794 }
1053 795
1054 if (nodep) 796 if (nodep)
@@ -1100,36 +842,12 @@ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
1100} 842}
1101EXPORT_SYMBOL(radix_tree_lookup); 843EXPORT_SYMBOL(radix_tree_lookup);
1102 844
1103static inline void replace_sibling_entries(struct radix_tree_node *node,
1104 void __rcu **slot, int count, int exceptional)
1105{
1106#ifdef CONFIG_RADIX_TREE_MULTIORDER
1107 void *ptr = node_to_entry(slot);
1108 unsigned offset = get_slot_offset(node, slot) + 1;
1109
1110 while (offset < RADIX_TREE_MAP_SIZE) {
1111 if (rcu_dereference_raw(node->slots[offset]) != ptr)
1112 break;
1113 if (count < 0) {
1114 node->slots[offset] = NULL;
1115 node->count--;
1116 }
1117 node->exceptional += exceptional;
1118 offset++;
1119 }
1120#endif
1121}
1122
1123static void replace_slot(void __rcu **slot, void *item, 845static void replace_slot(void __rcu **slot, void *item,
1124 struct radix_tree_node *node, int count, int exceptional) 846 struct radix_tree_node *node, int count, int values)
1125{ 847{
1126 if (WARN_ON_ONCE(radix_tree_is_internal_node(item))) 848 if (node && (count || values)) {
1127 return;
1128
1129 if (node && (count || exceptional)) {
1130 node->count += count; 849 node->count += count;
1131 node->exceptional += exceptional; 850 node->nr_values += values;
1132 replace_sibling_entries(node, slot, count, exceptional);
1133 } 851 }
1134 852
1135 rcu_assign_pointer(*slot, item); 853 rcu_assign_pointer(*slot, item);
@@ -1172,37 +890,31 @@ static int calculate_count(struct radix_tree_root *root,
1172 * @node: pointer to tree node 890 * @node: pointer to tree node
1173 * @slot: pointer to slot in @node 891 * @slot: pointer to slot in @node
1174 * @item: new item to store in the slot. 892 * @item: new item to store in the slot.
1175 * @update_node: callback for changing leaf nodes
1176 * 893 *
1177 * For use with __radix_tree_lookup(). Caller must hold tree write locked 894 * For use with __radix_tree_lookup(). Caller must hold tree write locked
1178 * across slot lookup and replacement. 895 * across slot lookup and replacement.
1179 */ 896 */
1180void __radix_tree_replace(struct radix_tree_root *root, 897void __radix_tree_replace(struct radix_tree_root *root,
1181 struct radix_tree_node *node, 898 struct radix_tree_node *node,
1182 void __rcu **slot, void *item, 899 void __rcu **slot, void *item)
1183 radix_tree_update_node_t update_node)
1184{ 900{
1185 void *old = rcu_dereference_raw(*slot); 901 void *old = rcu_dereference_raw(*slot);
1186 int exceptional = !!radix_tree_exceptional_entry(item) - 902 int values = !!xa_is_value(item) - !!xa_is_value(old);
1187 !!radix_tree_exceptional_entry(old);
1188 int count = calculate_count(root, node, slot, item, old); 903 int count = calculate_count(root, node, slot, item, old);
1189 904
1190 /* 905 /*
1191 * This function supports replacing exceptional entries and 906 * This function supports replacing value entries and
1192 * deleting entries, but that needs accounting against the 907 * deleting entries, but that needs accounting against the
1193 * node unless the slot is root->rnode. 908 * node unless the slot is root->xa_head.
1194 */ 909 */
1195 WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->rnode) && 910 WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) &&
1196 (count || exceptional)); 911 (count || values));
1197 replace_slot(slot, item, node, count, exceptional); 912 replace_slot(slot, item, node, count, values);
1198 913
1199 if (!node) 914 if (!node)
1200 return; 915 return;
1201 916
1202 if (update_node) 917 delete_node(root, node);
1203 update_node(node);
1204
1205 delete_node(root, node, update_node);
1206} 918}
1207 919
1208/** 920/**
@@ -1211,12 +923,12 @@ void __radix_tree_replace(struct radix_tree_root *root,
1211 * @slot: pointer to slot 923 * @slot: pointer to slot
1212 * @item: new item to store in the slot. 924 * @item: new item to store in the slot.
1213 * 925 *
1214 * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), 926 * For use with radix_tree_lookup_slot() and
1215 * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked 927 * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked
1216 * across slot lookup and replacement. 928 * across slot lookup and replacement.
1217 * 929 *
1218 * NOTE: This cannot be used to switch between non-entries (empty slots), 930 * NOTE: This cannot be used to switch between non-entries (empty slots),
1219 * regular entries, and exceptional entries, as that requires accounting 931 * regular entries, and value entries, as that requires accounting
1220 * inside the radix tree node. When switching from one type of entry or 932 * inside the radix tree node. When switching from one type of entry or
1221 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or 933 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
1222 * radix_tree_iter_replace(). 934 * radix_tree_iter_replace().
@@ -1224,7 +936,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
1224void radix_tree_replace_slot(struct radix_tree_root *root, 936void radix_tree_replace_slot(struct radix_tree_root *root,
1225 void __rcu **slot, void *item) 937 void __rcu **slot, void *item)
1226{ 938{
1227 __radix_tree_replace(root, NULL, slot, item, NULL); 939 __radix_tree_replace(root, NULL, slot, item);
1228} 940}
1229EXPORT_SYMBOL(radix_tree_replace_slot); 941EXPORT_SYMBOL(radix_tree_replace_slot);
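As the NOTE above spells out, this helper only swaps one regular entry for another; a hedged sketch of the usual lookup-slot-then-replace pattern (my_swap_entry is hypothetical), with the tree write-locked by the caller:

#include <linux/radix-tree.h>
#include <linux/errno.h>

/* Caller must hold the tree's write-side lock across both calls. */
static int my_swap_entry(struct radix_tree_root *root, unsigned long index,
			 void *new_item)
{
	void __rcu **slot = radix_tree_lookup_slot(root, index);

	if (!slot)
		return -ENOENT;
	/* new_item must be a regular entry: not NULL, not a value entry. */
	radix_tree_replace_slot(root, slot, new_item);
	return 0;
}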
1230 942
@@ -1234,162 +946,16 @@ EXPORT_SYMBOL(radix_tree_replace_slot);
1234 * @slot: pointer to slot 946 * @slot: pointer to slot
1235 * @item: new item to store in the slot. 947 * @item: new item to store in the slot.
1236 * 948 *
1237 * For use with radix_tree_split() and radix_tree_for_each_slot(). 949 * For use with radix_tree_for_each_slot().
1238 * Caller must hold tree write locked across split and replacement. 950 * Caller must hold tree write locked.
1239 */ 951 */
1240void radix_tree_iter_replace(struct radix_tree_root *root, 952void radix_tree_iter_replace(struct radix_tree_root *root,
1241 const struct radix_tree_iter *iter, 953 const struct radix_tree_iter *iter,
1242 void __rcu **slot, void *item) 954 void __rcu **slot, void *item)
1243{ 955{
1244 __radix_tree_replace(root, iter->node, slot, item, NULL); 956 __radix_tree_replace(root, iter->node, slot, item);
1245} 957}
1246 958
1247#ifdef CONFIG_RADIX_TREE_MULTIORDER
1248/**
1249 * radix_tree_join - replace multiple entries with one multiorder entry
1250 * @root: radix tree root
1251 * @index: an index inside the new entry
1252 * @order: order of the new entry
1253 * @item: new entry
1254 *
1255 * Call this function to replace several entries with one larger entry.
1256 * The existing entries are presumed to not need freeing as a result of
1257 * this call.
1258 *
1259 * The replacement entry will have all the tags set on it that were set
1260 * on any of the entries it is replacing.
1261 */
1262int radix_tree_join(struct radix_tree_root *root, unsigned long index,
1263 unsigned order, void *item)
1264{
1265 struct radix_tree_node *node;
1266 void __rcu **slot;
1267 int error;
1268
1269 BUG_ON(radix_tree_is_internal_node(item));
1270
1271 error = __radix_tree_create(root, index, order, &node, &slot);
1272 if (!error)
1273 error = insert_entries(node, slot, item, order, true);
1274 if (error > 0)
1275 error = 0;
1276
1277 return error;
1278}
1279
1280/**
1281 * radix_tree_split - Split an entry into smaller entries
1282 * @root: radix tree root
1283 * @index: An index within the large entry
1284 * @order: Order of new entries
1285 *
1286 * Call this function as the first step in replacing a multiorder entry
1287 * with several entries of lower order. After this function returns,
1288 * loop over the relevant portion of the tree using radix_tree_for_each_slot()
1289 * and call radix_tree_iter_replace() to set up each new entry.
1290 *
1291 * The tags from this entry are replicated to all the new entries.
1292 *
1293 * The radix tree should be locked against modification during the entire
1294 * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which
1295 * should prompt RCU walkers to restart the lookup from the root.
1296 */
1297int radix_tree_split(struct radix_tree_root *root, unsigned long index,
1298 unsigned order)
1299{
1300 struct radix_tree_node *parent, *node, *child;
1301 void __rcu **slot;
1302 unsigned int offset, end;
1303 unsigned n, tag, tags = 0;
1304 gfp_t gfp = root_gfp_mask(root);
1305
1306 if (!__radix_tree_lookup(root, index, &parent, &slot))
1307 return -ENOENT;
1308 if (!parent)
1309 return -ENOENT;
1310
1311 offset = get_slot_offset(parent, slot);
1312
1313 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1314 if (tag_get(parent, tag, offset))
1315 tags |= 1 << tag;
1316
1317 for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
1318 if (!is_sibling_entry(parent,
1319 rcu_dereference_raw(parent->slots[end])))
1320 break;
1321 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1322 if (tags & (1 << tag))
1323 tag_set(parent, tag, end);
1324 /* rcu_assign_pointer ensures tags are set before RETRY */
1325 rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY);
1326 }
1327 rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY);
1328 parent->exceptional -= (end - offset);
1329
1330 if (order == parent->shift)
1331 return 0;
1332 if (order > parent->shift) {
1333 while (offset < end)
1334 offset += insert_entries(parent, &parent->slots[offset],
1335 RADIX_TREE_RETRY, order, true);
1336 return 0;
1337 }
1338
1339 node = parent;
1340
1341 for (;;) {
1342 if (node->shift > order) {
1343 child = radix_tree_node_alloc(gfp, node, root,
1344 node->shift - RADIX_TREE_MAP_SHIFT,
1345 offset, 0, 0);
1346 if (!child)
1347 goto nomem;
1348 if (node != parent) {
1349 node->count++;
1350 rcu_assign_pointer(node->slots[offset],
1351 node_to_entry(child));
1352 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1353 if (tags & (1 << tag))
1354 tag_set(node, tag, offset);
1355 }
1356
1357 node = child;
1358 offset = 0;
1359 continue;
1360 }
1361
1362 n = insert_entries(node, &node->slots[offset],
1363 RADIX_TREE_RETRY, order, false);
1364 BUG_ON(n > RADIX_TREE_MAP_SIZE);
1365
1366 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1367 if (tags & (1 << tag))
1368 tag_set(node, tag, offset);
1369 offset += n;
1370
1371 while (offset == RADIX_TREE_MAP_SIZE) {
1372 if (node == parent)
1373 break;
1374 offset = node->offset;
1375 child = node;
1376 node = node->parent;
1377 rcu_assign_pointer(node->slots[offset],
1378 node_to_entry(child));
1379 offset++;
1380 }
1381 if ((node == parent) && (offset == end))
1382 return 0;
1383 }
1384
1385 nomem:
1386 /* Shouldn't happen; did user forget to preload? */
1387 /* TODO: free all the allocated nodes */
1388 WARN_ON(1);
1389 return -ENOMEM;
1390}
1391#endif
1392
1393static void node_tag_set(struct radix_tree_root *root, 959static void node_tag_set(struct radix_tree_root *root,
1394 struct radix_tree_node *node, 960 struct radix_tree_node *node,
1395 unsigned int tag, unsigned int offset) 961 unsigned int tag, unsigned int offset)
@@ -1447,18 +1013,6 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
1447} 1013}
1448EXPORT_SYMBOL(radix_tree_tag_set); 1014EXPORT_SYMBOL(radix_tree_tag_set);
1449 1015
1450/**
1451 * radix_tree_iter_tag_set - set a tag on the current iterator entry
1452 * @root: radix tree root
1453 * @iter: iterator state
1454 * @tag: tag to set
1455 */
1456void radix_tree_iter_tag_set(struct radix_tree_root *root,
1457 const struct radix_tree_iter *iter, unsigned int tag)
1458{
1459 node_tag_set(root, iter->node, tag, iter_offset(iter));
1460}
1461
1462static void node_tag_clear(struct radix_tree_root *root, 1016static void node_tag_clear(struct radix_tree_root *root,
1463 struct radix_tree_node *node, 1017 struct radix_tree_node *node,
1464 unsigned int tag, unsigned int offset) 1018 unsigned int tag, unsigned int offset)
@@ -1574,14 +1128,6 @@ int radix_tree_tag_get(const struct radix_tree_root *root,
1574} 1128}
1575EXPORT_SYMBOL(radix_tree_tag_get); 1129EXPORT_SYMBOL(radix_tree_tag_get);
1576 1130
1577static inline void __set_iter_shift(struct radix_tree_iter *iter,
1578 unsigned int shift)
1579{
1580#ifdef CONFIG_RADIX_TREE_MULTIORDER
1581 iter->shift = shift;
1582#endif
1583}
1584
1585/* Construct iter->tags bit-mask from node->tags[tag] array */ 1131/* Construct iter->tags bit-mask from node->tags[tag] array */
1586static void set_iter_tags(struct radix_tree_iter *iter, 1132static void set_iter_tags(struct radix_tree_iter *iter,
1587 struct radix_tree_node *node, unsigned offset, 1133 struct radix_tree_node *node, unsigned offset,
@@ -1608,92 +1154,11 @@ static void set_iter_tags(struct radix_tree_iter *iter,
1608 } 1154 }
1609} 1155}
1610 1156
1611#ifdef CONFIG_RADIX_TREE_MULTIORDER
1612static void __rcu **skip_siblings(struct radix_tree_node **nodep,
1613 void __rcu **slot, struct radix_tree_iter *iter)
1614{
1615 while (iter->index < iter->next_index) {
1616 *nodep = rcu_dereference_raw(*slot);
1617 if (*nodep && !is_sibling_entry(iter->node, *nodep))
1618 return slot;
1619 slot++;
1620 iter->index = __radix_tree_iter_add(iter, 1);
1621 iter->tags >>= 1;
1622 }
1623
1624 *nodep = NULL;
1625 return NULL;
1626}
1627
1628void __rcu **__radix_tree_next_slot(void __rcu **slot,
1629 struct radix_tree_iter *iter, unsigned flags)
1630{
1631 unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
1632 struct radix_tree_node *node;
1633
1634 slot = skip_siblings(&node, slot, iter);
1635
1636 while (radix_tree_is_internal_node(node)) {
1637 unsigned offset;
1638 unsigned long next_index;
1639
1640 if (node == RADIX_TREE_RETRY)
1641 return slot;
1642 node = entry_to_node(node);
1643 iter->node = node;
1644 iter->shift = node->shift;
1645
1646 if (flags & RADIX_TREE_ITER_TAGGED) {
1647 offset = radix_tree_find_next_bit(node, tag, 0);
1648 if (offset == RADIX_TREE_MAP_SIZE)
1649 return NULL;
1650 slot = &node->slots[offset];
1651 iter->index = __radix_tree_iter_add(iter, offset);
1652 set_iter_tags(iter, node, offset, tag);
1653 node = rcu_dereference_raw(*slot);
1654 } else {
1655 offset = 0;
1656 slot = &node->slots[0];
1657 for (;;) {
1658 node = rcu_dereference_raw(*slot);
1659 if (node)
1660 break;
1661 slot++;
1662 offset++;
1663 if (offset == RADIX_TREE_MAP_SIZE)
1664 return NULL;
1665 }
1666 iter->index = __radix_tree_iter_add(iter, offset);
1667 }
1668 if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
1669 goto none;
1670 next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
1671 if (next_index < iter->next_index)
1672 iter->next_index = next_index;
1673 }
1674
1675 return slot;
1676 none:
1677 iter->next_index = 0;
1678 return NULL;
1679}
1680EXPORT_SYMBOL(__radix_tree_next_slot);
1681#else
1682static void __rcu **skip_siblings(struct radix_tree_node **nodep,
1683 void __rcu **slot, struct radix_tree_iter *iter)
1684{
1685 return slot;
1686}
1687#endif
1688
1689void __rcu **radix_tree_iter_resume(void __rcu **slot, 1157void __rcu **radix_tree_iter_resume(void __rcu **slot,
1690 struct radix_tree_iter *iter) 1158 struct radix_tree_iter *iter)
1691{ 1159{
1692 struct radix_tree_node *node;
1693
1694 slot++; 1160 slot++;
1695 iter->index = __radix_tree_iter_add(iter, 1); 1161 iter->index = __radix_tree_iter_add(iter, 1);
1696 skip_siblings(&node, slot, iter);
1697 iter->next_index = iter->index; 1162 iter->next_index = iter->index;
1698 iter->tags = 0; 1163 iter->tags = 0;
1699 return NULL; 1164 return NULL;
@@ -1744,8 +1209,7 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
1744 iter->next_index = maxindex + 1; 1209 iter->next_index = maxindex + 1;
1745 iter->tags = 1; 1210 iter->tags = 1;
1746 iter->node = NULL; 1211 iter->node = NULL;
1747 __set_iter_shift(iter, 0); 1212 return (void __rcu **)&root->xa_head;
1748 return (void __rcu **)&root->rnode;
1749 } 1213 }
1750 1214
1751 do { 1215 do {
@@ -1765,8 +1229,6 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
1765 while (++offset < RADIX_TREE_MAP_SIZE) { 1229 while (++offset < RADIX_TREE_MAP_SIZE) {
1766 void *slot = rcu_dereference_raw( 1230 void *slot = rcu_dereference_raw(
1767 node->slots[offset]); 1231 node->slots[offset]);
1768 if (is_sibling_entry(node, slot))
1769 continue;
1770 if (slot) 1232 if (slot)
1771 break; 1233 break;
1772 } 1234 }
@@ -1784,13 +1246,12 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
1784 goto restart; 1246 goto restart;
1785 if (child == RADIX_TREE_RETRY) 1247 if (child == RADIX_TREE_RETRY)
1786 break; 1248 break;
1787 } while (radix_tree_is_internal_node(child)); 1249 } while (node->shift && radix_tree_is_internal_node(child));
1788 1250
1789 /* Update the iterator state */ 1251 /* Update the iterator state */
1790 iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); 1252 iter->index = (index &~ node_maxindex(node)) | offset;
1791 iter->next_index = (index | node_maxindex(node)) + 1; 1253 iter->next_index = (index | node_maxindex(node)) + 1;
1792 iter->node = node; 1254 iter->node = node;
1793 __set_iter_shift(iter, node->shift);
1794 1255
1795 if (flags & RADIX_TREE_ITER_TAGGED) 1256 if (flags & RADIX_TREE_ITER_TAGGED)
1796 set_iter_tags(iter, node, offset, tag); 1257 set_iter_tags(iter, node, offset, tag);
@@ -1847,48 +1308,6 @@ radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
1847EXPORT_SYMBOL(radix_tree_gang_lookup); 1308EXPORT_SYMBOL(radix_tree_gang_lookup);
1848 1309
1849/** 1310/**
1850 * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
1851 * @root: radix tree root
1852 * @results: where the results of the lookup are placed
1853 * @indices: where their indices should be placed (but usually NULL)
1854 * @first_index: start the lookup from this key
1855 * @max_items: place up to this many items at *results
1856 *
1857 * Performs an index-ascending scan of the tree for present items. Places
1858 * their slots at *@results and returns the number of items which were
1859 * placed at *@results.
1860 *
1861 * The implementation is naive.
1862 *
1863 * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
1864 * be dereferenced with radix_tree_deref_slot, and if using only RCU
1865 * protection, radix_tree_deref_slot may fail requiring a retry.
1866 */
1867unsigned int
1868radix_tree_gang_lookup_slot(const struct radix_tree_root *root,
1869 void __rcu ***results, unsigned long *indices,
1870 unsigned long first_index, unsigned int max_items)
1871{
1872 struct radix_tree_iter iter;
1873 void __rcu **slot;
1874 unsigned int ret = 0;
1875
1876 if (unlikely(!max_items))
1877 return 0;
1878
1879 radix_tree_for_each_slot(slot, root, &iter, first_index) {
1880 results[ret] = slot;
1881 if (indices)
1882 indices[ret] = iter.index;
1883 if (++ret == max_items)
1884 break;
1885 }
1886
1887 return ret;
1888}
1889EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
1890
1891/**
1892 * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree 1311 * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
1893 * based on a tag 1312 * based on a tag
1894 * @root: radix tree root 1313 * @root: radix tree root
@@ -1964,28 +1383,11 @@ radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
1964} 1383}
1965EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); 1384EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
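Tag-based gang lookup survives the conversion unchanged; a small sketch (MY_TAG and both helpers are hypothetical) of marking entries and collecting them in one pass:

#include <linux/radix-tree.h>

#define MY_TAG 0	/* hypothetical tag, 0 .. RADIX_TREE_MAX_TAGS - 1 */

static void my_mark(struct radix_tree_root *root, unsigned long index)
{
	/* The entry at @index must already exist. */
	radix_tree_tag_set(root, index, MY_TAG);
}

static unsigned int my_collect_marked(struct radix_tree_root *root,
				      void **results, unsigned int max)
{
	/* Gathers up to @max entries tagged MY_TAG, starting at index 0. */
	return radix_tree_gang_lookup_tag(root, results, 0, max, MY_TAG);
}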
1966 1385
1967/**
1968 * __radix_tree_delete_node - try to free node after clearing a slot
1969 * @root: radix tree root
1970 * @node: node containing @index
1971 * @update_node: callback for changing leaf nodes
1972 *
1973 * After clearing the slot at @index in @node from radix tree
1974 * rooted at @root, call this function to attempt freeing the
1975 * node and shrinking the tree.
1976 */
1977void __radix_tree_delete_node(struct radix_tree_root *root,
1978 struct radix_tree_node *node,
1979 radix_tree_update_node_t update_node)
1980{
1981 delete_node(root, node, update_node);
1982}
1983
1984static bool __radix_tree_delete(struct radix_tree_root *root, 1386static bool __radix_tree_delete(struct radix_tree_root *root,
1985 struct radix_tree_node *node, void __rcu **slot) 1387 struct radix_tree_node *node, void __rcu **slot)
1986{ 1388{
1987 void *old = rcu_dereference_raw(*slot); 1389 void *old = rcu_dereference_raw(*slot);
1988 int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0; 1390 int values = xa_is_value(old) ? -1 : 0;
1989 unsigned offset = get_slot_offset(node, slot); 1391 unsigned offset = get_slot_offset(node, slot);
1990 int tag; 1392 int tag;
1991 1393
@@ -1995,8 +1397,8 @@ static bool __radix_tree_delete(struct radix_tree_root *root,
1995 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1397 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1996 node_tag_clear(root, node, tag, offset); 1398 node_tag_clear(root, node, tag, offset);
1997 1399
1998 replace_slot(slot, NULL, node, -1, exceptional); 1400 replace_slot(slot, NULL, node, -1, values);
1999 return node && delete_node(root, node, NULL); 1401 return node && delete_node(root, node);
2000} 1402}
2001 1403
2002/** 1404/**
@@ -2068,19 +1470,6 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
2068} 1470}
2069EXPORT_SYMBOL(radix_tree_delete); 1471EXPORT_SYMBOL(radix_tree_delete);
2070 1472
2071void radix_tree_clear_tags(struct radix_tree_root *root,
2072 struct radix_tree_node *node,
2073 void __rcu **slot)
2074{
2075 if (node) {
2076 unsigned int tag, offset = get_slot_offset(node, slot);
2077 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
2078 node_tag_clear(root, node, tag, offset);
2079 } else {
2080 root_tag_clear_all(root);
2081 }
2082}
2083
2084/** 1473/**
2085 * radix_tree_tagged - test whether any items in the tree are tagged 1474 * radix_tree_tagged - test whether any items in the tree are tagged
2086 * @root: radix tree root 1475 * @root: radix tree root
@@ -2106,33 +1495,12 @@ void idr_preload(gfp_t gfp_mask)
2106} 1495}
2107EXPORT_SYMBOL(idr_preload); 1496EXPORT_SYMBOL(idr_preload);
2108 1497
2109int ida_pre_get(struct ida *ida, gfp_t gfp)
2110{
2111 /*
2112 * The IDA API has no preload_end() equivalent. Instead,
2113 * ida_get_new() can return -EAGAIN, prompting the caller
2114 * to return to the ida_pre_get() step.
2115 */
2116 if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
2117 preempt_enable();
2118
2119 if (!this_cpu_read(ida_bitmap)) {
2120 struct ida_bitmap *bitmap = kzalloc(sizeof(*bitmap), gfp);
2121 if (!bitmap)
2122 return 0;
2123 if (this_cpu_cmpxchg(ida_bitmap, NULL, bitmap))
2124 kfree(bitmap);
2125 }
2126
2127 return 1;
2128}
2129
2130void __rcu **idr_get_free(struct radix_tree_root *root, 1498void __rcu **idr_get_free(struct radix_tree_root *root,
2131 struct radix_tree_iter *iter, gfp_t gfp, 1499 struct radix_tree_iter *iter, gfp_t gfp,
2132 unsigned long max) 1500 unsigned long max)
2133{ 1501{
2134 struct radix_tree_node *node = NULL, *child; 1502 struct radix_tree_node *node = NULL, *child;
2135 void __rcu **slot = (void __rcu **)&root->rnode; 1503 void __rcu **slot = (void __rcu **)&root->xa_head;
2136 unsigned long maxindex, start = iter->next_index; 1504 unsigned long maxindex, start = iter->next_index;
2137 unsigned int shift, offset = 0; 1505 unsigned int shift, offset = 0;
2138 1506
@@ -2148,8 +1516,10 @@ void __rcu **idr_get_free(struct radix_tree_root *root,
2148 if (error < 0) 1516 if (error < 0)
2149 return ERR_PTR(error); 1517 return ERR_PTR(error);
2150 shift = error; 1518 shift = error;
2151 child = rcu_dereference_raw(root->rnode); 1519 child = rcu_dereference_raw(root->xa_head);
2152 } 1520 }
1521 if (start == 0 && shift == 0)
1522 shift = RADIX_TREE_MAP_SHIFT;
2153 1523
2154 while (shift) { 1524 while (shift) {
2155 shift -= RADIX_TREE_MAP_SHIFT; 1525 shift -= RADIX_TREE_MAP_SHIFT;
@@ -2192,7 +1562,6 @@ void __rcu **idr_get_free(struct radix_tree_root *root,
2192 else 1562 else
2193 iter->next_index = 1; 1563 iter->next_index = 1;
2194 iter->node = node; 1564 iter->node = node;
2195 __set_iter_shift(iter, shift);
2196 set_iter_tags(iter, node, offset, IDR_FREE); 1565 set_iter_tags(iter, node, offset, IDR_FREE);
2197 1566
2198 return slot; 1567 return slot;
@@ -2211,10 +1580,10 @@ void __rcu **idr_get_free(struct radix_tree_root *root,
2211 */ 1580 */
2212void idr_destroy(struct idr *idr) 1581void idr_destroy(struct idr *idr)
2213{ 1582{
2214 struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.rnode); 1583 struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
2215 if (radix_tree_is_internal_node(node)) 1584 if (radix_tree_is_internal_node(node))
2216 radix_tree_free_nodes(node); 1585 radix_tree_free_nodes(node);
2217 idr->idr_rt.rnode = NULL; 1586 idr->idr_rt.xa_head = NULL;
2218 root_tag_set(&idr->idr_rt, IDR_FREE); 1587 root_tag_set(&idr->idr_rt, IDR_FREE);
2219} 1588}
2220EXPORT_SYMBOL(idr_destroy); 1589EXPORT_SYMBOL(idr_destroy);
@@ -2228,31 +1597,6 @@ radix_tree_node_ctor(void *arg)
2228 INIT_LIST_HEAD(&node->private_list); 1597 INIT_LIST_HEAD(&node->private_list);
2229} 1598}
2230 1599
2231static __init unsigned long __maxindex(unsigned int height)
2232{
2233 unsigned int width = height * RADIX_TREE_MAP_SHIFT;
2234 int shift = RADIX_TREE_INDEX_BITS - width;
2235
2236 if (shift < 0)
2237 return ~0UL;
2238 if (shift >= BITS_PER_LONG)
2239 return 0UL;
2240 return ~0UL >> shift;
2241}
2242
2243static __init void radix_tree_init_maxnodes(void)
2244{
2245 unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1];
2246 unsigned int i, j;
2247
2248 for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
2249 height_to_maxindex[i] = __maxindex(i);
2250 for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) {
2251 for (j = i; j > 0; j--)
2252 height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1;
2253 }
2254}
2255
2256static int radix_tree_cpu_dead(unsigned int cpu) 1600static int radix_tree_cpu_dead(unsigned int cpu)
2257{ 1601{
2258 struct radix_tree_preload *rtp; 1602 struct radix_tree_preload *rtp;
@@ -2266,8 +1610,6 @@ static int radix_tree_cpu_dead(unsigned int cpu)
2266 kmem_cache_free(radix_tree_node_cachep, node); 1610 kmem_cache_free(radix_tree_node_cachep, node);
2267 rtp->nr--; 1611 rtp->nr--;
2268 } 1612 }
2269 kfree(per_cpu(ida_bitmap, cpu));
2270 per_cpu(ida_bitmap, cpu) = NULL;
2271 return 0; 1613 return 0;
2272} 1614}
2273 1615
@@ -2277,11 +1619,11 @@ void __init radix_tree_init(void)
2277 1619
2278 BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); 1620 BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
2279 BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); 1621 BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
1622 BUILD_BUG_ON(XA_CHUNK_SIZE > 255);
2280 radix_tree_node_cachep = kmem_cache_create("radix_tree_node", 1623 radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
2281 sizeof(struct radix_tree_node), 0, 1624 sizeof(struct radix_tree_node), 0,
2282 SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, 1625 SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
2283 radix_tree_node_ctor); 1626 radix_tree_node_ctor);
2284 radix_tree_init_maxnodes();
2285 ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", 1627 ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
2286 NULL, radix_tree_cpu_dead); 1628 NULL, radix_tree_cpu_dead);
2287 WARN_ON(ret < 0); 1629 WARN_ON(ret < 0);
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
new file mode 100644
index 000000000000..aa47754150ce
--- /dev/null
+++ b/lib/test_xarray.c
@@ -0,0 +1,1238 @@
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * test_xarray.c: Test the XArray API
4 * Copyright (c) 2017-2018 Microsoft Corporation
5 * Author: Matthew Wilcox <willy@infradead.org>
6 */
7
8#include <linux/xarray.h>
9#include <linux/module.h>
10
11static unsigned int tests_run;
12static unsigned int tests_passed;
13
14#ifndef XA_DEBUG
15# ifdef __KERNEL__
16void xa_dump(const struct xarray *xa) { }
17# endif
18#undef XA_BUG_ON
19#define XA_BUG_ON(xa, x) do { \
20 tests_run++; \
21 if (x) { \
22 printk("BUG at %s:%d\n", __func__, __LINE__); \
23 xa_dump(xa); \
24 dump_stack(); \
25 } else { \
26 tests_passed++; \
27 } \
28} while (0)
29#endif
30
31static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp)
32{
33 return xa_store(xa, index, xa_mk_value(index & LONG_MAX), gfp);
34}
35
36static void xa_alloc_index(struct xarray *xa, unsigned long index, gfp_t gfp)
37{
38 u32 id = 0;
39
40 XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_value(index & LONG_MAX),
41 gfp) != 0);
42 XA_BUG_ON(xa, id != index);
43}
44
45static void xa_erase_index(struct xarray *xa, unsigned long index)
46{
47 XA_BUG_ON(xa, xa_erase(xa, index) != xa_mk_value(index & LONG_MAX));
48 XA_BUG_ON(xa, xa_load(xa, index) != NULL);
49}
50
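The three helpers above lean on value entries: small integers encoded directly into a slot rather than stored as pointers. A minimal round-trip sketch, using only the xa_mk_value()/xa_is_value()/xa_to_value() helpers this file already exercises (the function name and the printed value are illustrative, not part of the patch):

        /* Illustrative only. */
        static void value_entry_demo(void)
        {
                void *entry = xa_mk_value(42);  /* encode the integer 42 in a slot-sized word */

                if (xa_is_value(entry))         /* true: a value entry, not a pointer */
                        pr_info("%lu\n", xa_to_value(entry));   /* prints 42 */
        }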
51/*
52 * If anyone needs this, please move it to xarray.c. We have no current
53 * users outside the test suite because all current multislot users want
54 * to use the advanced API.
55 */
56static void *xa_store_order(struct xarray *xa, unsigned long index,
57 unsigned order, void *entry, gfp_t gfp)
58{
59 XA_STATE_ORDER(xas, xa, index, order);
60 void *curr;
61
62 do {
63 xas_lock(&xas);
64 curr = xas_store(&xas, entry);
65 xas_unlock(&xas);
66 } while (xas_nomem(&xas, gfp));
67
68 return curr;
69}
70
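xa_store_order() is the normal-API-shaped wrapper this test suite uses for multi-index stores: take the lock, attempt the store, and loop through xas_nomem() until the allocation succeeds. A hedged usage sketch, mirroring how check_multi_store_1() calls it further down (the index, order and expected results are illustrative):

        xa_store_order(xa, 0, 3, xa_mk_value(0), GFP_KERNEL);
        /*
         * xa_load(xa, 0) through xa_load(xa, 7) now all return xa_mk_value(0),
         * while xa_load(xa, 8) still returns NULL.
         */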
71static noinline void check_xa_err(struct xarray *xa)
72{
73 XA_BUG_ON(xa, xa_err(xa_store_index(xa, 0, GFP_NOWAIT)) != 0);
74 XA_BUG_ON(xa, xa_err(xa_erase(xa, 0)) != 0);
75#ifndef __KERNEL__
76 /* The kernel does not fail GFP_NOWAIT allocations */
77 XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM);
78 XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM);
79#endif
80 XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_KERNEL)) != 0);
81 XA_BUG_ON(xa, xa_err(xa_store(xa, 1, xa_mk_value(0), GFP_KERNEL)) != 0);
82 XA_BUG_ON(xa, xa_err(xa_erase(xa, 1)) != 0);
83// kills the test-suite :-(
84// XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL);
85}
86
87static noinline void check_xas_retry(struct xarray *xa)
88{
89 XA_STATE(xas, xa, 0);
90 void *entry;
91
92 xa_store_index(xa, 0, GFP_KERNEL);
93 xa_store_index(xa, 1, GFP_KERNEL);
94
95 rcu_read_lock();
96 XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != xa_mk_value(0));
97 xa_erase_index(xa, 1);
98 XA_BUG_ON(xa, !xa_is_retry(xas_reload(&xas)));
99 XA_BUG_ON(xa, xas_retry(&xas, NULL));
100 XA_BUG_ON(xa, xas_retry(&xas, xa_mk_value(0)));
101 xas_reset(&xas);
102 XA_BUG_ON(xa, xas.xa_node != XAS_RESTART);
103 XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0));
104 XA_BUG_ON(xa, xas.xa_node != NULL);
105
106 XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL);
107 XA_BUG_ON(xa, !xa_is_internal(xas_reload(&xas)));
108 xas.xa_node = XAS_RESTART;
109 XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0));
110 rcu_read_unlock();
111
112 /* Make sure we can iterate through retry entries */
113 xas_lock(&xas);
114 xas_set(&xas, 0);
115 xas_store(&xas, XA_RETRY_ENTRY);
116 xas_set(&xas, 1);
117 xas_store(&xas, XA_RETRY_ENTRY);
118
119 xas_set(&xas, 0);
120 xas_for_each(&xas, entry, ULONG_MAX) {
121 xas_store(&xas, xa_mk_value(xas.xa_index));
122 }
123 xas_unlock(&xas);
124
125 xa_erase_index(xa, 0);
126 xa_erase_index(xa, 1);
127}
128
129static noinline void check_xa_load(struct xarray *xa)
130{
131 unsigned long i, j;
132
133 for (i = 0; i < 1024; i++) {
134 for (j = 0; j < 1024; j++) {
135 void *entry = xa_load(xa, j);
136 if (j < i)
137 XA_BUG_ON(xa, xa_to_value(entry) != j);
138 else
139 XA_BUG_ON(xa, entry);
140 }
141 XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL);
142 }
143
144 for (i = 0; i < 1024; i++) {
145 for (j = 0; j < 1024; j++) {
146 void *entry = xa_load(xa, j);
147 if (j >= i)
148 XA_BUG_ON(xa, xa_to_value(entry) != j);
149 else
150 XA_BUG_ON(xa, entry);
151 }
152 xa_erase_index(xa, i);
153 }
154 XA_BUG_ON(xa, !xa_empty(xa));
155}
156
157static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index)
158{
159 unsigned int order;
160 unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 8 : 1;
161
162 /* NULL elements have no marks set */
163 XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
164 xa_set_mark(xa, index, XA_MARK_0);
165 XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
166
167 /* Storing a pointer will not make a mark appear */
168 XA_BUG_ON(xa, xa_store_index(xa, index, GFP_KERNEL) != NULL);
169 XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
170 xa_set_mark(xa, index, XA_MARK_0);
171 XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0));
172
173 /* Setting one mark will not set another mark */
174 XA_BUG_ON(xa, xa_get_mark(xa, index + 1, XA_MARK_0));
175 XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_1));
176
177 /* Storing NULL clears marks, and they can't be set again */
178 xa_erase_index(xa, index);
179 XA_BUG_ON(xa, !xa_empty(xa));
180 XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
181 xa_set_mark(xa, index, XA_MARK_0);
182 XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
183
184 /*
185 * Storing a multi-index entry over entries with marks gives the
186 * entire entry the union of the marks
187 */
188 BUG_ON((index % 4) != 0);
189 for (order = 2; order < max_order; order++) {
190 unsigned long base = round_down(index, 1UL << order);
191 unsigned long next = base + (1UL << order);
192 unsigned long i;
193
194 XA_BUG_ON(xa, xa_store_index(xa, index + 1, GFP_KERNEL));
195 xa_set_mark(xa, index + 1, XA_MARK_0);
196 XA_BUG_ON(xa, xa_store_index(xa, index + 2, GFP_KERNEL));
197 xa_set_mark(xa, index + 2, XA_MARK_1);
198 XA_BUG_ON(xa, xa_store_index(xa, next, GFP_KERNEL));
199 xa_store_order(xa, index, order, xa_mk_value(index),
200 GFP_KERNEL);
201 for (i = base; i < next; i++) {
202 XA_STATE(xas, xa, i);
203 unsigned int seen = 0;
204 void *entry;
205
206 XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0));
207 XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_1));
208 XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_2));
209
210 /* We should see two elements in the array */
211 xas_for_each(&xas, entry, ULONG_MAX)
212 seen++;
213 XA_BUG_ON(xa, seen != 2);
214
215 /* One of which is marked */
216 xas_set(&xas, 0);
217 seen = 0;
218 xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0)
219 seen++;
220 XA_BUG_ON(xa, seen != 1);
221 }
222 XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_0));
223 XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_1));
224 XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_2));
225 xa_erase_index(xa, index);
226 xa_erase_index(xa, next);
227 XA_BUG_ON(xa, !xa_empty(xa));
228 }
229 XA_BUG_ON(xa, !xa_empty(xa));
230}
231
232static noinline void check_xa_mark_2(struct xarray *xa)
233{
234 XA_STATE(xas, xa, 0);
235 unsigned long index;
236 unsigned int count = 0;
237 void *entry;
238
239 xa_store_index(xa, 0, GFP_KERNEL);
240 xa_set_mark(xa, 0, XA_MARK_0);
241 xas_lock(&xas);
242 xas_load(&xas);
243 xas_init_marks(&xas);
244 xas_unlock(&xas);
245 XA_BUG_ON(xa, !xa_get_mark(xa, 0, XA_MARK_0) == 0);
246
247 for (index = 3500; index < 4500; index++) {
248 xa_store_index(xa, index, GFP_KERNEL);
249 xa_set_mark(xa, index, XA_MARK_0);
250 }
251
252 xas_reset(&xas);
253 rcu_read_lock();
254 xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0)
255 count++;
256 rcu_read_unlock();
257 XA_BUG_ON(xa, count != 1000);
258
259 xas_lock(&xas);
260 xas_for_each(&xas, entry, ULONG_MAX) {
261 xas_init_marks(&xas);
262 XA_BUG_ON(xa, !xa_get_mark(xa, xas.xa_index, XA_MARK_0));
263 XA_BUG_ON(xa, !xas_get_mark(&xas, XA_MARK_0));
264 }
265 xas_unlock(&xas);
266
267 xa_destroy(xa);
268}
269
270static noinline void check_xa_mark(struct xarray *xa)
271{
272 unsigned long index;
273
274 for (index = 0; index < 16384; index += 4)
275 check_xa_mark_1(xa, index);
276
277 check_xa_mark_2(xa);
278}
279
280static noinline void check_xa_shrink(struct xarray *xa)
281{
282 XA_STATE(xas, xa, 1);
283 struct xa_node *node;
284 unsigned int order;
285 unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 15 : 1;
286
287 XA_BUG_ON(xa, !xa_empty(xa));
288 XA_BUG_ON(xa, xa_store_index(xa, 0, GFP_KERNEL) != NULL);
289 XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL);
290
291 /*
292 * Check that erasing the entry at 1 shrinks the tree and properly
293 * marks the node as being deleted.
294 */
295 xas_lock(&xas);
296 XA_BUG_ON(xa, xas_load(&xas) != xa_mk_value(1));
297 node = xas.xa_node;
298 XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != xa_mk_value(0));
299 XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1));
300 XA_BUG_ON(xa, xa_load(xa, 1) != NULL);
301 XA_BUG_ON(xa, xas.xa_node != XAS_BOUNDS);
302 XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != XA_RETRY_ENTRY);
303 XA_BUG_ON(xa, xas_load(&xas) != NULL);
304 xas_unlock(&xas);
305 XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0));
306 xa_erase_index(xa, 0);
307 XA_BUG_ON(xa, !xa_empty(xa));
308
309 for (order = 0; order < max_order; order++) {
310 unsigned long max = (1UL << order) - 1;
311 xa_store_order(xa, 0, order, xa_mk_value(0), GFP_KERNEL);
312 XA_BUG_ON(xa, xa_load(xa, max) != xa_mk_value(0));
313 XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL);
314 rcu_read_lock();
315 node = xa_head(xa);
316 rcu_read_unlock();
317 XA_BUG_ON(xa, xa_store_index(xa, ULONG_MAX, GFP_KERNEL) !=
318 NULL);
319 rcu_read_lock();
320 XA_BUG_ON(xa, xa_head(xa) == node);
321 rcu_read_unlock();
322 XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL);
323 xa_erase_index(xa, ULONG_MAX);
324 XA_BUG_ON(xa, xa->xa_head != node);
325 xa_erase_index(xa, 0);
326 }
327}
328
329static noinline void check_cmpxchg(struct xarray *xa)
330{
331 void *FIVE = xa_mk_value(5);
332 void *SIX = xa_mk_value(6);
333 void *LOTS = xa_mk_value(12345678);
334
335 XA_BUG_ON(xa, !xa_empty(xa));
336 XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_KERNEL) != NULL);
337 XA_BUG_ON(xa, xa_insert(xa, 12345678, xa, GFP_KERNEL) != -EEXIST);
338 XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, SIX, FIVE, GFP_KERNEL) != LOTS);
339 XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, LOTS, FIVE, GFP_KERNEL) != LOTS);
340 XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, FIVE, LOTS, GFP_KERNEL) != FIVE);
341 XA_BUG_ON(xa, xa_cmpxchg(xa, 5, FIVE, NULL, GFP_KERNEL) != NULL);
342 XA_BUG_ON(xa, xa_cmpxchg(xa, 5, NULL, FIVE, GFP_KERNEL) != NULL);
343 xa_erase_index(xa, 12345678);
344 xa_erase_index(xa, 5);
345 XA_BUG_ON(xa, !xa_empty(xa));
346}
347
348static noinline void check_reserve(struct xarray *xa)
349{
350 void *entry;
351 unsigned long index = 0;
352
353 /* An array with a reserved entry is not empty */
354 XA_BUG_ON(xa, !xa_empty(xa));
355 xa_reserve(xa, 12345678, GFP_KERNEL);
356 XA_BUG_ON(xa, xa_empty(xa));
357 XA_BUG_ON(xa, xa_load(xa, 12345678));
358 xa_release(xa, 12345678);
359 XA_BUG_ON(xa, !xa_empty(xa));
360
361 /* Releasing a used entry does nothing */
362 xa_reserve(xa, 12345678, GFP_KERNEL);
363 XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_NOWAIT) != NULL);
364 xa_release(xa, 12345678);
365 xa_erase_index(xa, 12345678);
366 XA_BUG_ON(xa, !xa_empty(xa));
367
368 /* cmpxchg sees a reserved entry as NULL */
369 xa_reserve(xa, 12345678, GFP_KERNEL);
370 XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, NULL, xa_mk_value(12345678),
371 GFP_NOWAIT) != NULL);
372 xa_release(xa, 12345678);
373 xa_erase_index(xa, 12345678);
374 XA_BUG_ON(xa, !xa_empty(xa));
375
376 /* Can iterate through a reserved entry */
377 xa_store_index(xa, 5, GFP_KERNEL);
378 xa_reserve(xa, 6, GFP_KERNEL);
379 xa_store_index(xa, 7, GFP_KERNEL);
380
381 xa_for_each(xa, entry, index, ULONG_MAX, XA_PRESENT) {
382 XA_BUG_ON(xa, index != 5 && index != 7);
383 }
384 xa_destroy(xa);
385}
386
387static noinline void check_xas_erase(struct xarray *xa)
388{
389 XA_STATE(xas, xa, 0);
390 void *entry;
391 unsigned long i, j;
392
393 for (i = 0; i < 200; i++) {
394 for (j = i; j < 2 * i + 17; j++) {
395 xas_set(&xas, j);
396 do {
397 xas_lock(&xas);
398 xas_store(&xas, xa_mk_value(j));
399 xas_unlock(&xas);
400 } while (xas_nomem(&xas, GFP_KERNEL));
401 }
402
403 xas_set(&xas, ULONG_MAX);
404 do {
405 xas_lock(&xas);
406 xas_store(&xas, xa_mk_value(0));
407 xas_unlock(&xas);
408 } while (xas_nomem(&xas, GFP_KERNEL));
409
410 xas_lock(&xas);
411 xas_store(&xas, NULL);
412
413 xas_set(&xas, 0);
414 j = i;
415 xas_for_each(&xas, entry, ULONG_MAX) {
416 XA_BUG_ON(xa, entry != xa_mk_value(j));
417 xas_store(&xas, NULL);
418 j++;
419 }
420 xas_unlock(&xas);
421 XA_BUG_ON(xa, !xa_empty(xa));
422 }
423}
424
425#ifdef CONFIG_XARRAY_MULTI
426static noinline void check_multi_store_1(struct xarray *xa, unsigned long index,
427 unsigned int order)
428{
429 XA_STATE(xas, xa, index);
430 unsigned long min = index & ~((1UL << order) - 1);
431 unsigned long max = min + (1UL << order);
432
433 xa_store_order(xa, index, order, xa_mk_value(index), GFP_KERNEL);
434 XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(index));
435 XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(index));
436 XA_BUG_ON(xa, xa_load(xa, max) != NULL);
437 XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL);
438
439 XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(min)) != xa_mk_value(index));
440 XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(min));
441 XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(min));
442 XA_BUG_ON(xa, xa_load(xa, max) != NULL);
443 XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL);
444
445 xa_erase_index(xa, min);
446 XA_BUG_ON(xa, !xa_empty(xa));
447}
448
449static noinline void check_multi_store_2(struct xarray *xa, unsigned long index,
450 unsigned int order)
451{
452 XA_STATE(xas, xa, index);
453 xa_store_order(xa, index, order, xa_mk_value(0), GFP_KERNEL);
454
455 XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(1)) != xa_mk_value(0));
456 XA_BUG_ON(xa, xas.xa_index != index);
457 XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1));
458 XA_BUG_ON(xa, !xa_empty(xa));
459}
460#endif
461
462static noinline void check_multi_store(struct xarray *xa)
463{
464#ifdef CONFIG_XARRAY_MULTI
465 unsigned long i, j, k;
466 unsigned int max_order = (sizeof(long) == 4) ? 30 : 60;
467
468 /* Loading from any position returns the same value */
469 xa_store_order(xa, 0, 1, xa_mk_value(0), GFP_KERNEL);
470 XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0));
471 XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0));
472 XA_BUG_ON(xa, xa_load(xa, 2) != NULL);
473 rcu_read_lock();
474 XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 2);
475 XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2);
476 rcu_read_unlock();
477
478 /* Storing adjacent to the value does not alter the value */
479 xa_store(xa, 3, xa, GFP_KERNEL);
480 XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0));
481 XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0));
482 XA_BUG_ON(xa, xa_load(xa, 2) != NULL);
483 rcu_read_lock();
484 XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 3);
485 XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2);
486 rcu_read_unlock();
487
488 /* Overwriting multiple indexes works */
489 xa_store_order(xa, 0, 2, xa_mk_value(1), GFP_KERNEL);
490 XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(1));
491 XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(1));
492 XA_BUG_ON(xa, xa_load(xa, 2) != xa_mk_value(1));
493 XA_BUG_ON(xa, xa_load(xa, 3) != xa_mk_value(1));
494 XA_BUG_ON(xa, xa_load(xa, 4) != NULL);
495 rcu_read_lock();
496 XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 4);
497 XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 4);
498 rcu_read_unlock();
499
500 /* We can erase multiple values with a single store */
501 xa_store_order(xa, 0, 63, NULL, GFP_KERNEL);
502 XA_BUG_ON(xa, !xa_empty(xa));
503
504 /* Even when the first slot is empty but the others aren't */
505 xa_store_index(xa, 1, GFP_KERNEL);
506 xa_store_index(xa, 2, GFP_KERNEL);
507 xa_store_order(xa, 0, 2, NULL, GFP_KERNEL);
508 XA_BUG_ON(xa, !xa_empty(xa));
509
510 for (i = 0; i < max_order; i++) {
511 for (j = 0; j < max_order; j++) {
512 xa_store_order(xa, 0, i, xa_mk_value(i), GFP_KERNEL);
513 xa_store_order(xa, 0, j, xa_mk_value(j), GFP_KERNEL);
514
515 for (k = 0; k < max_order; k++) {
516 void *entry = xa_load(xa, (1UL << k) - 1);
517 if ((i < k) && (j < k))
518 XA_BUG_ON(xa, entry != NULL);
519 else
520 XA_BUG_ON(xa, entry != xa_mk_value(j));
521 }
522
523 xa_erase(xa, 0);
524 XA_BUG_ON(xa, !xa_empty(xa));
525 }
526 }
527
528 for (i = 0; i < 20; i++) {
529 check_multi_store_1(xa, 200, i);
530 check_multi_store_1(xa, 0, i);
531 check_multi_store_1(xa, (1UL << i) + 1, i);
532 }
533 check_multi_store_2(xa, 4095, 9);
534#endif
535}
536
537static DEFINE_XARRAY_ALLOC(xa0);
538
539static noinline void check_xa_alloc(void)
540{
541 int i;
542 u32 id;
543
544 /* An empty array should assign 0 to the first alloc */
545 xa_alloc_index(&xa0, 0, GFP_KERNEL);
546
547 /* Erasing it should make the array empty again */
548 xa_erase_index(&xa0, 0);
549 XA_BUG_ON(&xa0, !xa_empty(&xa0));
550
551 /* And it should assign 0 again */
552 xa_alloc_index(&xa0, 0, GFP_KERNEL);
553
554 /* The next assigned ID should be 1 */
555 xa_alloc_index(&xa0, 1, GFP_KERNEL);
556 xa_erase_index(&xa0, 1);
557
558 /* Storing a value should mark it used */
559 xa_store_index(&xa0, 1, GFP_KERNEL);
560 xa_alloc_index(&xa0, 2, GFP_KERNEL);
561
562 /* If we then erase 0, it should be free */
563 xa_erase_index(&xa0, 0);
564 xa_alloc_index(&xa0, 0, GFP_KERNEL);
565
566 xa_erase_index(&xa0, 1);
567 xa_erase_index(&xa0, 2);
568
569 for (i = 1; i < 5000; i++) {
570 xa_alloc_index(&xa0, i, GFP_KERNEL);
571 }
572
573 xa_destroy(&xa0);
574
575 id = 0xfffffffeU;
576 XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0),
577 GFP_KERNEL) != 0);
578 XA_BUG_ON(&xa0, id != 0xfffffffeU);
579 XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0),
580 GFP_KERNEL) != 0);
581 XA_BUG_ON(&xa0, id != 0xffffffffU);
582 XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0),
583 GFP_KERNEL) != -ENOSPC);
584 XA_BUG_ON(&xa0, id != 0xffffffffU);
585 xa_destroy(&xa0);
586}
587
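check_xa_alloc() above exercises the allocating XArray declared with DEFINE_XARRAY_ALLOC(). For readers new to the calling convention, a minimal sketch of the same pattern outside the test harness (the array and function names are illustrative; the xa_alloc() signature and the -ENOSPC failure are the ones exercised above):

        /* Illustrative only. */
        static DEFINE_XARRAY_ALLOC(my_ids);

        static int grab_id(void *ptr, u32 *id)
        {
                /* On success, *id holds the lowest free index, starting at 0. */
                return xa_alloc(&my_ids, id, UINT_MAX, ptr, GFP_KERNEL);
        }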
588static noinline void __check_store_iter(struct xarray *xa, unsigned long start,
589 unsigned int order, unsigned int present)
590{
591 XA_STATE_ORDER(xas, xa, start, order);
592 void *entry;
593 unsigned int count = 0;
594
595retry:
596 xas_lock(&xas);
597 xas_for_each_conflict(&xas, entry) {
598 XA_BUG_ON(xa, !xa_is_value(entry));
599 XA_BUG_ON(xa, entry < xa_mk_value(start));
600 XA_BUG_ON(xa, entry > xa_mk_value(start + (1UL << order) - 1));
601 count++;
602 }
603 xas_store(&xas, xa_mk_value(start));
604 xas_unlock(&xas);
605 if (xas_nomem(&xas, GFP_KERNEL)) {
606 count = 0;
607 goto retry;
608 }
609 XA_BUG_ON(xa, xas_error(&xas));
610 XA_BUG_ON(xa, count != present);
611 XA_BUG_ON(xa, xa_load(xa, start) != xa_mk_value(start));
612 XA_BUG_ON(xa, xa_load(xa, start + (1UL << order) - 1) !=
613 xa_mk_value(start));
614 xa_erase_index(xa, start);
615}
616
617static noinline void check_store_iter(struct xarray *xa)
618{
619 unsigned int i, j;
620 unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
621
622 for (i = 0; i < max_order; i++) {
623 unsigned int min = 1 << i;
624 unsigned int max = (2 << i) - 1;
625 __check_store_iter(xa, 0, i, 0);
626 XA_BUG_ON(xa, !xa_empty(xa));
627 __check_store_iter(xa, min, i, 0);
628 XA_BUG_ON(xa, !xa_empty(xa));
629
630 xa_store_index(xa, min, GFP_KERNEL);
631 __check_store_iter(xa, min, i, 1);
632 XA_BUG_ON(xa, !xa_empty(xa));
633 xa_store_index(xa, max, GFP_KERNEL);
634 __check_store_iter(xa, min, i, 1);
635 XA_BUG_ON(xa, !xa_empty(xa));
636
637 for (j = 0; j < min; j++)
638 xa_store_index(xa, j, GFP_KERNEL);
639 __check_store_iter(xa, 0, i, min);
640 XA_BUG_ON(xa, !xa_empty(xa));
641 for (j = 0; j < min; j++)
642 xa_store_index(xa, min + j, GFP_KERNEL);
643 __check_store_iter(xa, min, i, min);
644 XA_BUG_ON(xa, !xa_empty(xa));
645 }
646#ifdef CONFIG_XARRAY_MULTI
647 xa_store_index(xa, 63, GFP_KERNEL);
648 xa_store_index(xa, 65, GFP_KERNEL);
649 __check_store_iter(xa, 64, 2, 1);
650 xa_erase_index(xa, 63);
651#endif
652 XA_BUG_ON(xa, !xa_empty(xa));
653}
654
655static noinline void check_multi_find(struct xarray *xa)
656{
657#ifdef CONFIG_XARRAY_MULTI
658 unsigned long index;
659
660 xa_store_order(xa, 12, 2, xa_mk_value(12), GFP_KERNEL);
661 XA_BUG_ON(xa, xa_store_index(xa, 16, GFP_KERNEL) != NULL);
662
663 index = 0;
664 XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) !=
665 xa_mk_value(12));
666 XA_BUG_ON(xa, index != 12);
667 index = 13;
668 XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) !=
669 xa_mk_value(12));
670 XA_BUG_ON(xa, (index < 12) || (index >= 16));
671 XA_BUG_ON(xa, xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT) !=
672 xa_mk_value(16));
673 XA_BUG_ON(xa, index != 16);
674
675 xa_erase_index(xa, 12);
676 xa_erase_index(xa, 16);
677 XA_BUG_ON(xa, !xa_empty(xa));
678#endif
679}
680
681static noinline void check_multi_find_2(struct xarray *xa)
682{
683 unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 10 : 1;
684 unsigned int i, j;
685 void *entry;
686
687 for (i = 0; i < max_order; i++) {
688 unsigned long index = 1UL << i;
689 for (j = 0; j < index; j++) {
690 XA_STATE(xas, xa, j + index);
691 xa_store_index(xa, index - 1, GFP_KERNEL);
692 xa_store_order(xa, index, i, xa_mk_value(index),
693 GFP_KERNEL);
694 rcu_read_lock();
695 xas_for_each(&xas, entry, ULONG_MAX) {
696 xa_erase_index(xa, index);
697 }
698 rcu_read_unlock();
699 xa_erase_index(xa, index - 1);
700 XA_BUG_ON(xa, !xa_empty(xa));
701 }
702 }
703}
704
705static noinline void check_find(struct xarray *xa)
706{
707 unsigned long i, j, k;
708
709 XA_BUG_ON(xa, !xa_empty(xa));
710
711 /*
712 * Check xa_find with all pairs between 0 and 99 inclusive,
713 * starting at every index between 0 and 99
714 */
715 for (i = 0; i < 100; i++) {
716 XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL);
717 xa_set_mark(xa, i, XA_MARK_0);
718 for (j = 0; j < i; j++) {
719 XA_BUG_ON(xa, xa_store_index(xa, j, GFP_KERNEL) !=
720 NULL);
721 xa_set_mark(xa, j, XA_MARK_0);
722 for (k = 0; k < 100; k++) {
723 unsigned long index = k;
724 void *entry = xa_find(xa, &index, ULONG_MAX,
725 XA_PRESENT);
726 if (k <= j)
727 XA_BUG_ON(xa, index != j);
728 else if (k <= i)
729 XA_BUG_ON(xa, index != i);
730 else
731 XA_BUG_ON(xa, entry != NULL);
732
733 index = k;
734 entry = xa_find(xa, &index, ULONG_MAX,
735 XA_MARK_0);
736 if (k <= j)
737 XA_BUG_ON(xa, index != j);
738 else if (k <= i)
739 XA_BUG_ON(xa, index != i);
740 else
741 XA_BUG_ON(xa, entry != NULL);
742 }
743 xa_erase_index(xa, j);
744 XA_BUG_ON(xa, xa_get_mark(xa, j, XA_MARK_0));
745 XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0));
746 }
747 xa_erase_index(xa, i);
748 XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_0));
749 }
750 XA_BUG_ON(xa, !xa_empty(xa));
751 check_multi_find(xa);
752 check_multi_find_2(xa);
753}
754
755/* See find_swap_entry() in mm/shmem.c */
756static noinline unsigned long xa_find_entry(struct xarray *xa, void *item)
757{
758 XA_STATE(xas, xa, 0);
759 unsigned int checked = 0;
760 void *entry;
761
762 rcu_read_lock();
763 xas_for_each(&xas, entry, ULONG_MAX) {
764 if (xas_retry(&xas, entry))
765 continue;
766 if (entry == item)
767 break;
768 checked++;
769 if ((checked % 4) != 0)
770 continue;
771 xas_pause(&xas);
772 }
773 rcu_read_unlock();
774
775 return entry ? xas.xa_index : -1;
776}
777
778static noinline void check_find_entry(struct xarray *xa)
779{
780#ifdef CONFIG_XARRAY_MULTI
781 unsigned int order;
782 unsigned long offset, index;
783
784 for (order = 0; order < 20; order++) {
785 for (offset = 0; offset < (1UL << (order + 3));
786 offset += (1UL << order)) {
787 for (index = 0; index < (1UL << (order + 5));
788 index += (1UL << order)) {
789 xa_store_order(xa, index, order,
790 xa_mk_value(index), GFP_KERNEL);
791 XA_BUG_ON(xa, xa_load(xa, index) !=
792 xa_mk_value(index));
793 XA_BUG_ON(xa, xa_find_entry(xa,
794 xa_mk_value(index)) != index);
795 }
796 XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1);
797 xa_destroy(xa);
798 }
799 }
800#endif
801
802 XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1);
803 xa_store_index(xa, ULONG_MAX, GFP_KERNEL);
804 XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1);
805 XA_BUG_ON(xa, xa_find_entry(xa, xa_mk_value(LONG_MAX)) != -1);
806 xa_erase_index(xa, ULONG_MAX);
807 XA_BUG_ON(xa, !xa_empty(xa));
808}
809
810static noinline void check_move_small(struct xarray *xa, unsigned long idx)
811{
812 XA_STATE(xas, xa, 0);
813 unsigned long i;
814
815 xa_store_index(xa, 0, GFP_KERNEL);
816 xa_store_index(xa, idx, GFP_KERNEL);
817
818 rcu_read_lock();
819 for (i = 0; i < idx * 4; i++) {
820 void *entry = xas_next(&xas);
821 if (i <= idx)
822 XA_BUG_ON(xa, xas.xa_node == XAS_RESTART);
823 XA_BUG_ON(xa, xas.xa_index != i);
824 if (i == 0 || i == idx)
825 XA_BUG_ON(xa, entry != xa_mk_value(i));
826 else
827 XA_BUG_ON(xa, entry != NULL);
828 }
829 xas_next(&xas);
830 XA_BUG_ON(xa, xas.xa_index != i);
831
832 do {
833 void *entry = xas_prev(&xas);
834 i--;
835 if (i <= idx)
836 XA_BUG_ON(xa, xas.xa_node == XAS_RESTART);
837 XA_BUG_ON(xa, xas.xa_index != i);
838 if (i == 0 || i == idx)
839 XA_BUG_ON(xa, entry != xa_mk_value(i));
840 else
841 XA_BUG_ON(xa, entry != NULL);
842 } while (i > 0);
843
844 xas_set(&xas, ULONG_MAX);
845 XA_BUG_ON(xa, xas_next(&xas) != NULL);
846 XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
847 XA_BUG_ON(xa, xas_next(&xas) != xa_mk_value(0));
848 XA_BUG_ON(xa, xas.xa_index != 0);
849 XA_BUG_ON(xa, xas_prev(&xas) != NULL);
850 XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
851 rcu_read_unlock();
852
853 xa_erase_index(xa, 0);
854 xa_erase_index(xa, idx);
855 XA_BUG_ON(xa, !xa_empty(xa));
856}
857
858static noinline void check_move(struct xarray *xa)
859{
860 XA_STATE(xas, xa, (1 << 16) - 1);
861 unsigned long i;
862
863 for (i = 0; i < (1 << 16); i++)
864 XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL);
865
866 rcu_read_lock();
867 do {
868 void *entry = xas_prev(&xas);
869 i--;
870 XA_BUG_ON(xa, entry != xa_mk_value(i));
871 XA_BUG_ON(xa, i != xas.xa_index);
872 } while (i != 0);
873
874 XA_BUG_ON(xa, xas_prev(&xas) != NULL);
875 XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
876
877 do {
878 void *entry = xas_next(&xas);
879 XA_BUG_ON(xa, entry != xa_mk_value(i));
880 XA_BUG_ON(xa, i != xas.xa_index);
881 i++;
882 } while (i < (1 << 16));
883 rcu_read_unlock();
884
885 for (i = (1 << 8); i < (1 << 15); i++)
886 xa_erase_index(xa, i);
887
888 i = xas.xa_index;
889
890 rcu_read_lock();
891 do {
892 void *entry = xas_prev(&xas);
893 i--;
894 if ((i < (1 << 8)) || (i >= (1 << 15)))
895 XA_BUG_ON(xa, entry != xa_mk_value(i));
896 else
897 XA_BUG_ON(xa, entry != NULL);
898 XA_BUG_ON(xa, i != xas.xa_index);
899 } while (i != 0);
900
901 XA_BUG_ON(xa, xas_prev(&xas) != NULL);
902 XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
903
904 do {
905 void *entry = xas_next(&xas);
906 if ((i < (1 << 8)) || (i >= (1 << 15)))
907 XA_BUG_ON(xa, entry != xa_mk_value(i));
908 else
909 XA_BUG_ON(xa, entry != NULL);
910 XA_BUG_ON(xa, i != xas.xa_index);
911 i++;
912 } while (i < (1 << 16));
913 rcu_read_unlock();
914
915 xa_destroy(xa);
916
917 for (i = 0; i < 16; i++)
918 check_move_small(xa, 1UL << i);
919
920 for (i = 2; i < 16; i++)
921 check_move_small(xa, (1UL << i) - 1);
922}
923
924static noinline void xa_store_many_order(struct xarray *xa,
925 unsigned long index, unsigned order)
926{
927 XA_STATE_ORDER(xas, xa, index, order);
928 unsigned int i = 0;
929
930 do {
931 xas_lock(&xas);
932 XA_BUG_ON(xa, xas_find_conflict(&xas));
933 xas_create_range(&xas);
934 if (xas_error(&xas))
935 goto unlock;
936 for (i = 0; i < (1U << order); i++) {
937 XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(index + i)));
938 xas_next(&xas);
939 }
940unlock:
941 xas_unlock(&xas);
942 } while (xas_nomem(&xas, GFP_KERNEL));
943
944 XA_BUG_ON(xa, xas_error(&xas));
945}
946
947static noinline void check_create_range_1(struct xarray *xa,
948 unsigned long index, unsigned order)
949{
950 unsigned long i;
951
952 xa_store_many_order(xa, index, order);
953 for (i = index; i < index + (1UL << order); i++)
954 xa_erase_index(xa, i);
955 XA_BUG_ON(xa, !xa_empty(xa));
956}
957
958static noinline void check_create_range_2(struct xarray *xa, unsigned order)
959{
960 unsigned long i;
961 unsigned long nr = 1UL << order;
962
963 for (i = 0; i < nr * nr; i += nr)
964 xa_store_many_order(xa, i, order);
965 for (i = 0; i < nr * nr; i++)
966 xa_erase_index(xa, i);
967 XA_BUG_ON(xa, !xa_empty(xa));
968}
969
970static noinline void check_create_range_3(void)
971{
972 XA_STATE(xas, NULL, 0);
973 xas_set_err(&xas, -EEXIST);
974 xas_create_range(&xas);
975 XA_BUG_ON(NULL, xas_error(&xas) != -EEXIST);
976}
977
978static noinline void check_create_range_4(struct xarray *xa,
979 unsigned long index, unsigned order)
980{
981 XA_STATE_ORDER(xas, xa, index, order);
982 unsigned long base = xas.xa_index;
983 unsigned long i = 0;
984
985 xa_store_index(xa, index, GFP_KERNEL);
986 do {
987 xas_lock(&xas);
988 xas_create_range(&xas);
989 if (xas_error(&xas))
990 goto unlock;
991 for (i = 0; i < (1UL << order); i++) {
992 void *old = xas_store(&xas, xa_mk_value(base + i));
993 if (xas.xa_index == index)
994 XA_BUG_ON(xa, old != xa_mk_value(base + i));
995 else
996 XA_BUG_ON(xa, old != NULL);
997 xas_next(&xas);
998 }
999unlock:
1000 xas_unlock(&xas);
1001 } while (xas_nomem(&xas, GFP_KERNEL));
1002
1003 XA_BUG_ON(xa, xas_error(&xas));
1004
1005 for (i = base; i < base + (1UL << order); i++)
1006 xa_erase_index(xa, i);
1007 XA_BUG_ON(xa, !xa_empty(xa));
1008}
1009
1010static noinline void check_create_range(struct xarray *xa)
1011{
1012 unsigned int order;
1013 unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 12 : 1;
1014
1015 for (order = 0; order < max_order; order++) {
1016 check_create_range_1(xa, 0, order);
1017 check_create_range_1(xa, 1U << order, order);
1018 check_create_range_1(xa, 2U << order, order);
1019 check_create_range_1(xa, 3U << order, order);
1020 check_create_range_1(xa, 1U << 24, order);
1021 if (order < 10)
1022 check_create_range_2(xa, order);
1023
1024 check_create_range_4(xa, 0, order);
1025 check_create_range_4(xa, 1U << order, order);
1026 check_create_range_4(xa, 2U << order, order);
1027 check_create_range_4(xa, 3U << order, order);
1028 check_create_range_4(xa, 1U << 24, order);
1029
1030 check_create_range_4(xa, 1, order);
1031 check_create_range_4(xa, (1U << order) + 1, order);
1032 check_create_range_4(xa, (2U << order) + 1, order);
1033 check_create_range_4(xa, (2U << order) - 1, order);
1034 check_create_range_4(xa, (3U << order) + 1, order);
1035 check_create_range_4(xa, (3U << order) - 1, order);
1036 check_create_range_4(xa, (1U << 24) + 1, order);
1037 }
1038
1039 check_create_range_3();
1040}
1041
1042static noinline void __check_store_range(struct xarray *xa, unsigned long first,
1043 unsigned long last)
1044{
1045#ifdef CONFIG_XARRAY_MULTI
1046 xa_store_range(xa, first, last, xa_mk_value(first), GFP_KERNEL);
1047
1048 XA_BUG_ON(xa, xa_load(xa, first) != xa_mk_value(first));
1049 XA_BUG_ON(xa, xa_load(xa, last) != xa_mk_value(first));
1050 XA_BUG_ON(xa, xa_load(xa, first - 1) != NULL);
1051 XA_BUG_ON(xa, xa_load(xa, last + 1) != NULL);
1052
1053 xa_store_range(xa, first, last, NULL, GFP_KERNEL);
1054#endif
1055
1056 XA_BUG_ON(xa, !xa_empty(xa));
1057}
1058
1059static noinline void check_store_range(struct xarray *xa)
1060{
1061 unsigned long i, j;
1062
1063 for (i = 0; i < 128; i++) {
1064 for (j = i; j < 128; j++) {
1065 __check_store_range(xa, i, j);
1066 __check_store_range(xa, 128 + i, 128 + j);
1067 __check_store_range(xa, 4095 + i, 4095 + j);
1068 __check_store_range(xa, 4096 + i, 4096 + j);
1069 __check_store_range(xa, 123456 + i, 123456 + j);
1070 __check_store_range(xa, UINT_MAX + i, UINT_MAX + j);
1071 }
1072 }
1073}
1074
1075static LIST_HEAD(shadow_nodes);
1076
1077static void test_update_node(struct xa_node *node)
1078{
1079 if (node->count && node->count == node->nr_values) {
1080 if (list_empty(&node->private_list))
1081 list_add(&shadow_nodes, &node->private_list);
1082 } else {
1083 if (!list_empty(&node->private_list))
1084 list_del_init(&node->private_list);
1085 }
1086}
1087
1088static noinline void shadow_remove(struct xarray *xa)
1089{
1090 struct xa_node *node;
1091
1092 xa_lock(xa);
1093 while ((node = list_first_entry_or_null(&shadow_nodes,
1094 struct xa_node, private_list))) {
1095 XA_STATE(xas, node->array, 0);
1096 XA_BUG_ON(xa, node->array != xa);
1097 list_del_init(&node->private_list);
1098 xas.xa_node = xa_parent_locked(node->array, node);
1099 xas.xa_offset = node->offset;
1100 xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
1101 xas_set_update(&xas, test_update_node);
1102 xas_store(&xas, NULL);
1103 }
1104 xa_unlock(xa);
1105}
1106
1107static noinline void check_workingset(struct xarray *xa, unsigned long index)
1108{
1109 XA_STATE(xas, xa, index);
1110 xas_set_update(&xas, test_update_node);
1111
1112 do {
1113 xas_lock(&xas);
1114 xas_store(&xas, xa_mk_value(0));
1115 xas_next(&xas);
1116 xas_store(&xas, xa_mk_value(1));
1117 xas_unlock(&xas);
1118 } while (xas_nomem(&xas, GFP_KERNEL));
1119
1120 XA_BUG_ON(xa, list_empty(&shadow_nodes));
1121
1122 xas_lock(&xas);
1123 xas_next(&xas);
1124 xas_store(&xas, &xas);
1125 XA_BUG_ON(xa, !list_empty(&shadow_nodes));
1126
1127 xas_store(&xas, xa_mk_value(2));
1128 xas_unlock(&xas);
1129 XA_BUG_ON(xa, list_empty(&shadow_nodes));
1130
1131 shadow_remove(xa);
1132 XA_BUG_ON(xa, !list_empty(&shadow_nodes));
1133 XA_BUG_ON(xa, !xa_empty(xa));
1134}
1135
1136/*
1137 * Check that the pointer / value / sibling entries are accounted the
1138 * way we expect them to be.
1139 */
1140static noinline void check_account(struct xarray *xa)
1141{
1142#ifdef CONFIG_XARRAY_MULTI
1143 unsigned int order;
1144
1145 for (order = 1; order < 12; order++) {
1146 XA_STATE(xas, xa, 1 << order);
1147
1148 xa_store_order(xa, 0, order, xa, GFP_KERNEL);
1149 xas_load(&xas);
1150 XA_BUG_ON(xa, xas.xa_node->count == 0);
1151 XA_BUG_ON(xa, xas.xa_node->count > (1 << order));
1152 XA_BUG_ON(xa, xas.xa_node->nr_values != 0);
1153
1154 xa_store_order(xa, 1 << order, order, xa_mk_value(1 << order),
1155 GFP_KERNEL);
1156 XA_BUG_ON(xa, xas.xa_node->count != xas.xa_node->nr_values * 2);
1157
1158 xa_erase(xa, 1 << order);
1159 XA_BUG_ON(xa, xas.xa_node->nr_values != 0);
1160
1161 xa_erase(xa, 0);
1162 XA_BUG_ON(xa, !xa_empty(xa));
1163 }
1164#endif
1165}
1166
1167static noinline void check_destroy(struct xarray *xa)
1168{
1169 unsigned long index;
1170
1171 XA_BUG_ON(xa, !xa_empty(xa));
1172
1173 /* Destroying an empty array is a no-op */
1174 xa_destroy(xa);
1175 XA_BUG_ON(xa, !xa_empty(xa));
1176
1177 /* Destroying an array with a single entry */
1178 for (index = 0; index < 1000; index++) {
1179 xa_store_index(xa, index, GFP_KERNEL);
1180 XA_BUG_ON(xa, xa_empty(xa));
1181 xa_destroy(xa);
1182 XA_BUG_ON(xa, !xa_empty(xa));
1183 }
1184
1185 /* Destroying an array with a single entry at ULONG_MAX */
1186 xa_store(xa, ULONG_MAX, xa, GFP_KERNEL);
1187 XA_BUG_ON(xa, xa_empty(xa));
1188 xa_destroy(xa);
1189 XA_BUG_ON(xa, !xa_empty(xa));
1190
1191#ifdef CONFIG_XARRAY_MULTI
1192 /* Destroying an array with a multi-index entry */
1193 xa_store_order(xa, 1 << 11, 11, xa, GFP_KERNEL);
1194 XA_BUG_ON(xa, xa_empty(xa));
1195 xa_destroy(xa);
1196 XA_BUG_ON(xa, !xa_empty(xa));
1197#endif
1198}
1199
1200static DEFINE_XARRAY(array);
1201
1202static int xarray_checks(void)
1203{
1204 check_xa_err(&array);
1205 check_xas_retry(&array);
1206 check_xa_load(&array);
1207 check_xa_mark(&array);
1208 check_xa_shrink(&array);
1209 check_xas_erase(&array);
1210 check_cmpxchg(&array);
1211 check_reserve(&array);
1212 check_multi_store(&array);
1213 check_xa_alloc();
1214 check_find(&array);
1215 check_find_entry(&array);
1216 check_account(&array);
1217 check_destroy(&array);
1218 check_move(&array);
1219 check_create_range(&array);
1220 check_store_range(&array);
1221 check_store_iter(&array);
1222
1223 check_workingset(&array, 0);
1224 check_workingset(&array, 64);
1225 check_workingset(&array, 4096);
1226
1227 printk("XArray: %u of %u tests passed\n", tests_passed, tests_run);
1228 return (tests_run == tests_passed) ? 0 : -EINVAL;
1229}
1230
1231static void xarray_exit(void)
1232{
1233}
1234
1235module_init(xarray_checks);
1236module_exit(xarray_exit);
1237MODULE_AUTHOR("Matthew Wilcox <willy@infradead.org>");
1238MODULE_LICENSE("GPL");
diff --git a/lib/xarray.c b/lib/xarray.c
new file mode 100644
index 000000000000..8b176f009c08
--- /dev/null
+++ b/lib/xarray.c
@@ -0,0 +1,2036 @@
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * XArray implementation
4 * Copyright (c) 2017 Microsoft Corporation
5 * Author: Matthew Wilcox <willy@infradead.org>
6 */
7
8#include <linux/bitmap.h>
9#include <linux/export.h>
10#include <linux/list.h>
11#include <linux/slab.h>
12#include <linux/xarray.h>
13
14/*
15 * Coding conventions in this file:
16 *
17 * @xa is used to refer to the entire xarray.
18 * @xas is the 'xarray operation state'. It may be either a pointer to
19 * an xa_state, or an xa_state stored on the stack. This is an unfortunate
20 * ambiguity.
21 * @index is the index of the entry being operated on
22 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
23 * @node refers to an xa_node; usually the primary one being operated on by
24 * this function.
25 * @offset is the index into the slots array inside an xa_node.
26 * @parent refers to the @xa_node closer to the head than @node.
27 * @entry refers to something stored in a slot in the xarray
28 */
29
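Concretely, the operation state named by @xas is normally an on-stack structure declared with XA_STATE(); a tiny sketch of how the conventions above map onto a caller (the array name and index are illustrative):

        struct xarray *xa = &some_xarray;       /* @xa: the whole array */
        XA_STATE(xas, xa, 42);                  /* @xas: on-stack operation state; @index == 42 */
        /* @node, @offset and @entry below all describe steps of walking this state. */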
30static inline unsigned int xa_lock_type(const struct xarray *xa)
31{
32 return (__force unsigned int)xa->xa_flags & 3;
33}
34
35static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
36{
37 if (lock_type == XA_LOCK_IRQ)
38 xas_lock_irq(xas);
39 else if (lock_type == XA_LOCK_BH)
40 xas_lock_bh(xas);
41 else
42 xas_lock(xas);
43}
44
45static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
46{
47 if (lock_type == XA_LOCK_IRQ)
48 xas_unlock_irq(xas);
49 else if (lock_type == XA_LOCK_BH)
50 xas_unlock_bh(xas);
51 else
52 xas_unlock(xas);
53}
54
55static inline bool xa_track_free(const struct xarray *xa)
56{
57 return xa->xa_flags & XA_FLAGS_TRACK_FREE;
58}
59
60static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
61{
62 if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
63 xa->xa_flags |= XA_FLAGS_MARK(mark);
64}
65
66static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
67{
68 if (xa->xa_flags & XA_FLAGS_MARK(mark))
69 xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
70}
71
72static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
73{
74 return node->marks[(__force unsigned)mark];
75}
76
77static inline bool node_get_mark(struct xa_node *node,
78 unsigned int offset, xa_mark_t mark)
79{
80 return test_bit(offset, node_marks(node, mark));
81}
82
83/* returns true if the bit was set */
84static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
85 xa_mark_t mark)
86{
87 return __test_and_set_bit(offset, node_marks(node, mark));
88}
89
90/* returns true if the bit was set */
91static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
92 xa_mark_t mark)
93{
94 return __test_and_clear_bit(offset, node_marks(node, mark));
95}
96
97static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
98{
99 return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE);
100}
101
102static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
103{
104 bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE);
105}
106
107#define mark_inc(mark) do { \
108 mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
109} while (0)
110
111/*
112 * xas_squash_marks() - Merge all marks to the first entry
113 * @xas: Array operation state.
114 *
115 * Set a mark on the first entry if any entry has it set. Clear marks on
116 * all sibling entries.
117 */
118static void xas_squash_marks(const struct xa_state *xas)
119{
120 unsigned int mark = 0;
121 unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;
122
123 if (!xas->xa_sibs)
124 return;
125
126 do {
127 unsigned long *marks = xas->xa_node->marks[mark];
128 if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
129 continue;
130 __set_bit(xas->xa_offset, marks);
131 bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
132 } while (mark++ != (__force unsigned)XA_MARK_MAX);
133}
134
135/* extracts the offset within this node from the index */
136static unsigned int get_offset(unsigned long index, struct xa_node *node)
137{
138 return (index >> node->shift) & XA_CHUNK_MASK;
139}
140
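A quick worked example of the extraction above, assuming the usual 64-slot nodes (XA_CHUNK_MASK == 63):

        /*
         * index = 4660 (0x1234), node->shift = 6:
         *   (4660 >> 6) & 63 == 72 & 63 == 8, so the walk continues in slot 8.
         */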
141static void xas_set_offset(struct xa_state *xas)
142{
143 xas->xa_offset = get_offset(xas->xa_index, xas->xa_node);
144}
145
146/* move the index either forwards (find) or backwards (sibling slot) */
147static void xas_move_index(struct xa_state *xas, unsigned long offset)
148{
149 unsigned int shift = xas->xa_node->shift;
150 xas->xa_index &= ~XA_CHUNK_MASK << shift;
151 xas->xa_index += offset << shift;
152}
153
154static void xas_advance(struct xa_state *xas)
155{
156 xas->xa_offset++;
157 xas_move_index(xas, xas->xa_offset);
158}
159
160static void *set_bounds(struct xa_state *xas)
161{
162 xas->xa_node = XAS_BOUNDS;
163 return NULL;
164}
165
166/*
167 * Starts a walk. If the @xas is already valid, we assume that it's on
168 * the right path and just return where we've got to. If we're in an
169 * error state, return NULL. If the index is outside the current scope
170 * of the xarray, return NULL without changing @xas->xa_node. Otherwise
171 * set @xas->xa_node to NULL and return the current head of the array.
172 */
173static void *xas_start(struct xa_state *xas)
174{
175 void *entry;
176
177 if (xas_valid(xas))
178 return xas_reload(xas);
179 if (xas_error(xas))
180 return NULL;
181
182 entry = xa_head(xas->xa);
183 if (!xa_is_node(entry)) {
184 if (xas->xa_index)
185 return set_bounds(xas);
186 } else {
187 if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
188 return set_bounds(xas);
189 }
190
191 xas->xa_node = NULL;
192 return entry;
193}
194
195static void *xas_descend(struct xa_state *xas, struct xa_node *node)
196{
197 unsigned int offset = get_offset(xas->xa_index, node);
198 void *entry = xa_entry(xas->xa, node, offset);
199
200 xas->xa_node = node;
201 if (xa_is_sibling(entry)) {
202 offset = xa_to_sibling(entry);
203 entry = xa_entry(xas->xa, node, offset);
204 }
205
206 xas->xa_offset = offset;
207 return entry;
208}
209
210/**
211 * xas_load() - Load an entry from the XArray (advanced).
212 * @xas: XArray operation state.
213 *
214 * Usually walks the @xas to the appropriate state to load the entry
215 * stored at xa_index. However, it will do nothing and return %NULL if
216 * @xas is in an error state. xas_load() will never expand the tree.
217 *
218 * If the xa_state is set up to operate on a multi-index entry, xas_load()
219 * may return %NULL or an internal entry, even if there are entries
220 * present within the range specified by @xas.
221 *
222 * Context: Any context. The caller should hold the xa_lock or the RCU lock.
223 * Return: Usually an entry in the XArray, but see description for exceptions.
224 */
225void *xas_load(struct xa_state *xas)
226{
227 void *entry = xas_start(xas);
228
229 while (xa_is_node(entry)) {
230 struct xa_node *node = xa_to_node(entry);
231
232 if (xas->xa_shift > node->shift)
233 break;
234 entry = xas_descend(xas, node);
235 }
236 return entry;
237}
238EXPORT_SYMBOL_GPL(xas_load);
239
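A hedged sketch of the calling convention the kernel-doc above describes, holding the RCU read lock and restarting if the walk lands on a retry entry (the wrapper name is illustrative, not an API added by this patch):

        /* Illustrative only. */
        static void *load_with_retry(struct xarray *xa, unsigned long index)
        {
                XA_STATE(xas, xa, index);
                void *entry;

                rcu_read_lock();
                do {
                        entry = xas_load(&xas);
                } while (xas_retry(&xas, entry));       /* resets @xas and loops on retry entries */
                rcu_read_unlock();

                return entry;
        }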
240/* Move the radix tree node cache here */
241extern struct kmem_cache *radix_tree_node_cachep;
242extern void radix_tree_node_rcu_free(struct rcu_head *head);
243
244#define XA_RCU_FREE ((struct xarray *)1)
245
246static void xa_node_free(struct xa_node *node)
247{
248 XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
249 node->array = XA_RCU_FREE;
250 call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
251}
252
253/*
254 * xas_destroy() - Free any resources allocated during the XArray operation.
255 * @xas: XArray operation state.
256 *
257 * This function is now internal-only.
258 */
259static void xas_destroy(struct xa_state *xas)
260{
261 struct xa_node *node = xas->xa_alloc;
262
263 if (!node)
264 return;
265 XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
266 kmem_cache_free(radix_tree_node_cachep, node);
267 xas->xa_alloc = NULL;
268}
269
270/**
271 * xas_nomem() - Allocate memory if needed.
272 * @xas: XArray operation state.
273 * @gfp: Memory allocation flags.
274 *
275 * If we need to add new nodes to the XArray, we try to allocate memory
276 * with GFP_NOWAIT while holding the lock, which will usually succeed.
277 * If it fails, @xas is flagged as needing memory to continue. The caller
278 * should drop the lock and call xas_nomem(). If xas_nomem() succeeds,
279 * the caller should retry the operation.
280 *
281 * Forward progress is guaranteed as one node is allocated here and
282 * stored in the xa_state where it will be found by xas_alloc(). More
283 * nodes will likely be found in the slab allocator, but we do not tie
284 * them up here.
285 *
286 * Return: true if memory was needed, and was successfully allocated.
287 */
288bool xas_nomem(struct xa_state *xas, gfp_t gfp)
289{
290 if (xas->xa_node != XA_ERROR(-ENOMEM)) {
291 xas_destroy(xas);
292 return false;
293 }
294 xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
295 if (!xas->xa_alloc)
296 return false;
297 XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
298 xas->xa_node = XAS_RESTART;
299 return true;
300}
301EXPORT_SYMBOL_GPL(xas_nomem);
302
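The drop-the-lock-and-retry discipline described above is exactly what lib/test_xarray.c's xa_store_order() does; for reference, the canonical shape of the loop (the wrapper name, entry and GFP flags are illustrative):

        /* Illustrative only. */
        static void store_with_retry(struct xarray *xa, unsigned long index, void *entry)
        {
                XA_STATE(xas, xa, index);

                do {
                        xas_lock(&xas);
                        xas_store(&xas, entry);
                        xas_unlock(&xas);
                } while (xas_nomem(&xas, GFP_KERNEL));  /* true means memory was allocated: retry */
        }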
303/*
304 * __xas_nomem() - Drop locks and allocate memory if needed.
305 * @xas: XArray operation state.
306 * @gfp: Memory allocation flags.
307 *
308 * Internal variant of xas_nomem().
309 *
310 * Return: true if memory was needed, and was successfully allocated.
311 */
312static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
313 __must_hold(xas->xa->xa_lock)
314{
315 unsigned int lock_type = xa_lock_type(xas->xa);
316
317 if (xas->xa_node != XA_ERROR(-ENOMEM)) {
318 xas_destroy(xas);
319 return false;
320 }
321 if (gfpflags_allow_blocking(gfp)) {
322 xas_unlock_type(xas, lock_type);
323 xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
324 xas_lock_type(xas, lock_type);
325 } else {
326 xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
327 }
328 if (!xas->xa_alloc)
329 return false;
330 XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
331 xas->xa_node = XAS_RESTART;
332 return true;
333}
334
335static void xas_update(struct xa_state *xas, struct xa_node *node)
336{
337 if (xas->xa_update)
338 xas->xa_update(node);
339 else
340 XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
341}
342
343static void *xas_alloc(struct xa_state *xas, unsigned int shift)
344{
345 struct xa_node *parent = xas->xa_node;
346 struct xa_node *node = xas->xa_alloc;
347
348 if (xas_invalid(xas))
349 return NULL;
350
351 if (node) {
352 xas->xa_alloc = NULL;
353 } else {
354 node = kmem_cache_alloc(radix_tree_node_cachep,
355 GFP_NOWAIT | __GFP_NOWARN);
356 if (!node) {
357 xas_set_err(xas, -ENOMEM);
358 return NULL;
359 }
360 }
361
362 if (parent) {
363 node->offset = xas->xa_offset;
364 parent->count++;
365 XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
366 xas_update(xas, parent);
367 }
368 XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
369 XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
370 node->shift = shift;
371 node->count = 0;
372 node->nr_values = 0;
373 RCU_INIT_POINTER(node->parent, xas->xa_node);
374 node->array = xas->xa;
375
376 return node;
377}
378
379#ifdef CONFIG_XARRAY_MULTI
380/* Returns the number of indices covered by a given xa_state */
381static unsigned long xas_size(const struct xa_state *xas)
382{
383 return (xas->xa_sibs + 1UL) << xas->xa_shift;
384}
385#endif
386
387/*
388 * Use this to calculate the maximum index that will need to be created
389 * in order to add the entry described by @xas. Because we cannot store a
390 * multiple-index entry at index 0, the calculation is a little more complex
391 * than you might expect.
392 */
393static unsigned long xas_max(struct xa_state *xas)
394{
395 unsigned long max = xas->xa_index;
396
397#ifdef CONFIG_XARRAY_MULTI
398 if (xas->xa_shift || xas->xa_sibs) {
399 unsigned long mask = xas_size(xas) - 1;
400 max |= mask;
401 if (mask == max)
402 max++;
403 }
404#endif
405
406 return max;
407}
408
409/* The maximum index that can be contained in the array without expanding it */
410static unsigned long max_index(void *entry)
411{
412 if (!xa_is_node(entry))
413 return 0;
414 return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
415}
416
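For a sense of scale, assuming the common XA_CHUNK_SHIFT of 6 (64 slots per node), the expression above works out to:

        /*
         *   head node shift  0: max_index = (64 <<  0) - 1 = 63
         *   head node shift  6: max_index = (64 <<  6) - 1 = 4095
         *   head node shift 12: max_index = (64 << 12) - 1 = 262143
         */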
417static void xas_shrink(struct xa_state *xas)
418{
419 struct xarray *xa = xas->xa;
420 struct xa_node *node = xas->xa_node;
421
422 for (;;) {
423 void *entry;
424
425 XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
426 if (node->count != 1)
427 break;
428 entry = xa_entry_locked(xa, node, 0);
429 if (!entry)
430 break;
431 if (!xa_is_node(entry) && node->shift)
432 break;
433 xas->xa_node = XAS_BOUNDS;
434
435 RCU_INIT_POINTER(xa->xa_head, entry);
436 if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
437 xa_mark_clear(xa, XA_FREE_MARK);
438
439 node->count = 0;
440 node->nr_values = 0;
441 if (!xa_is_node(entry))
442 RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
443 xas_update(xas, node);
444 xa_node_free(node);
445 if (!xa_is_node(entry))
446 break;
447 node = xa_to_node(entry);
448 node->parent = NULL;
449 }
450}
451
452/*
453 * xas_delete_node() - Attempt to delete an xa_node
454 * @xas: Array operation state.
455 *
456 * Attempts to delete the @xas->xa_node. This will fail if xa->node has
457 * a non-zero reference count.
458 */
459static void xas_delete_node(struct xa_state *xas)
460{
461 struct xa_node *node = xas->xa_node;
462
463 for (;;) {
464 struct xa_node *parent;
465
466 XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
467 if (node->count)
468 break;
469
470 parent = xa_parent_locked(xas->xa, node);
471 xas->xa_node = parent;
472 xas->xa_offset = node->offset;
473 xa_node_free(node);
474
475 if (!parent) {
476 xas->xa->xa_head = NULL;
477 xas->xa_node = XAS_BOUNDS;
478 return;
479 }
480
481 parent->slots[xas->xa_offset] = NULL;
482 parent->count--;
483 XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
484 node = parent;
485 xas_update(xas, node);
486 }
487
488 if (!node->parent)
489 xas_shrink(xas);
490}
491
492/**
493 * xas_free_nodes() - Free this node and all nodes that it references
494 * @xas: Array operation state.
495 * @top: Node to free
496 *
497 * This node has been removed from the tree. We must now free it and all
498 * of its subnodes. There may be RCU walkers with references into the tree,
499 * so we must replace all entries with retry markers.
500 */
501static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
502{
503 unsigned int offset = 0;
504 struct xa_node *node = top;
505
506 for (;;) {
507 void *entry = xa_entry_locked(xas->xa, node, offset);
508
509 if (xa_is_node(entry)) {
510 node = xa_to_node(entry);
511 offset = 0;
512 continue;
513 }
514 if (entry)
515 RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
516 offset++;
517 while (offset == XA_CHUNK_SIZE) {
518 struct xa_node *parent;
519
520 parent = xa_parent_locked(xas->xa, node);
521 offset = node->offset + 1;
522 node->count = 0;
523 node->nr_values = 0;
524 xas_update(xas, node);
525 xa_node_free(node);
526 if (node == top)
527 return;
528 node = parent;
529 }
530 }
531}
532
533/*
534 * xas_expand adds nodes to the head of the tree until it has reached
535 * sufficient height to be able to contain @xas->xa_index
536 */
537static int xas_expand(struct xa_state *xas, void *head)
538{
539 struct xarray *xa = xas->xa;
540 struct xa_node *node = NULL;
541 unsigned int shift = 0;
542 unsigned long max = xas_max(xas);
543
544 if (!head) {
545 if (max == 0)
546 return 0;
547 while ((max >> shift) >= XA_CHUNK_SIZE)
548 shift += XA_CHUNK_SHIFT;
549 return shift + XA_CHUNK_SHIFT;
550 } else if (xa_is_node(head)) {
551 node = xa_to_node(head);
552 shift = node->shift + XA_CHUNK_SHIFT;
553 }
554 xas->xa_node = NULL;
555
556 while (max > max_index(head)) {
557 xa_mark_t mark = 0;
558
559 XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
560 node = xas_alloc(xas, shift);
561 if (!node)
562 return -ENOMEM;
563
564 node->count = 1;
565 if (xa_is_value(head))
566 node->nr_values = 1;
567 RCU_INIT_POINTER(node->slots[0], head);
568
569 /* Propagate the aggregated mark info to the new child */
570 for (;;) {
571 if (xa_track_free(xa) && mark == XA_FREE_MARK) {
572 node_mark_all(node, XA_FREE_MARK);
573 if (!xa_marked(xa, XA_FREE_MARK)) {
574 node_clear_mark(node, 0, XA_FREE_MARK);
575 xa_mark_set(xa, XA_FREE_MARK);
576 }
577 } else if (xa_marked(xa, mark)) {
578 node_set_mark(node, 0, mark);
579 }
580 if (mark == XA_MARK_MAX)
581 break;
582 mark_inc(mark);
583 }
584
585 /*
586 * Now that the new node is fully initialised, we can add
587 * it to the tree
588 */
589 if (xa_is_node(head)) {
590 xa_to_node(head)->offset = 0;
591 rcu_assign_pointer(xa_to_node(head)->parent, node);
592 }
593 head = xa_mk_node(node);
594 rcu_assign_pointer(xa->xa_head, head);
595 xas_update(xas, node);
596
597 shift += XA_CHUNK_SHIFT;
598 }
599
600 xas->xa_node = node;
601 return shift;
602}
603
604/*
605 * xas_create() - Create a slot to store an entry in.
606 * @xas: XArray operation state.
607 *
608 * Most users will not need to call this function directly, as it is called
609 * by xas_store(). It is useful for doing conditional store operations
610 * (see the xa_cmpxchg() implementation for an example).
611 *
612 * Return: If the slot already existed, returns the contents of this slot.
613 * If the slot was newly created, returns NULL. If it failed to create the
614 * slot, returns NULL and indicates the error in @xas.
615 */
616static void *xas_create(struct xa_state *xas)
617{
618 struct xarray *xa = xas->xa;
619 void *entry;
620 void __rcu **slot;
621 struct xa_node *node = xas->xa_node;
622 int shift;
623 unsigned int order = xas->xa_shift;
624
625 if (xas_top(node)) {
626 entry = xa_head_locked(xa);
627 xas->xa_node = NULL;
628 shift = xas_expand(xas, entry);
629 if (shift < 0)
630 return NULL;
631 entry = xa_head_locked(xa);
632 slot = &xa->xa_head;
633 } else if (xas_error(xas)) {
634 return NULL;
635 } else if (node) {
636 unsigned int offset = xas->xa_offset;
637
638 shift = node->shift;
639 entry = xa_entry_locked(xa, node, offset);
640 slot = &node->slots[offset];
641 } else {
642 shift = 0;
643 entry = xa_head_locked(xa);
644 slot = &xa->xa_head;
645 }
646
647 while (shift > order) {
648 shift -= XA_CHUNK_SHIFT;
649 if (!entry) {
650 node = xas_alloc(xas, shift);
651 if (!node)
652 break;
653 if (xa_track_free(xa))
654 node_mark_all(node, XA_FREE_MARK);
655 rcu_assign_pointer(*slot, xa_mk_node(node));
656 } else if (xa_is_node(entry)) {
657 node = xa_to_node(entry);
658 } else {
659 break;
660 }
661 entry = xas_descend(xas, node);
662 slot = &node->slots[xas->xa_offset];
663 }
664
665 return entry;
666}
667
668/**
669 * xas_create_range() - Ensure that stores to this range will succeed
670 * @xas: XArray operation state.
671 *
672 * Creates all of the slots in the range covered by @xas. Sets @xas to
673 * create single-index entries and positions it at the beginning of the
674 * range. This is for the benefit of users which have not yet been
675 * converted to use multi-index entries.
676 */
677void xas_create_range(struct xa_state *xas)
678{
679 unsigned long index = xas->xa_index;
680 unsigned char shift = xas->xa_shift;
681 unsigned char sibs = xas->xa_sibs;
682
683 xas->xa_index |= ((sibs + 1) << shift) - 1;
684 if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
685 xas->xa_offset |= sibs;
686 xas->xa_shift = 0;
687 xas->xa_sibs = 0;
688
689 for (;;) {
690 xas_create(xas);
691 if (xas_error(xas))
692 goto restore;
693 if (xas->xa_index <= (index | XA_CHUNK_MASK))
694 goto success;
695 xas->xa_index -= XA_CHUNK_SIZE;
696
697 for (;;) {
698 struct xa_node *node = xas->xa_node;
699 xas->xa_node = xa_parent_locked(xas->xa, node);
700 xas->xa_offset = node->offset - 1;
701 if (node->offset != 0)
702 break;
703 }
704 }
705
706restore:
707 xas->xa_shift = shift;
708 xas->xa_sibs = sibs;
709 xas->xa_index = index;
710 return;
711success:
712 xas->xa_index = index;
713 if (xas->xa_node)
714 xas_set_offset(xas);
715}
716EXPORT_SYMBOL_GPL(xas_create_range);
717
718static void update_node(struct xa_state *xas, struct xa_node *node,
719 int count, int values)
720{
721 if (!node || (!count && !values))
722 return;
723
724 node->count += count;
725 node->nr_values += values;
726 XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
727 XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
728 xas_update(xas, node);
729 if (count < 0)
730 xas_delete_node(xas);
731}
732
733/**
734 * xas_store() - Store this entry in the XArray.
735 * @xas: XArray operation state.
736 * @entry: New entry.
737 *
738 * If @xas is operating on a multi-index entry, the entry returned by this
739 * function is essentially meaningless (it may be an internal entry or it
740 * may be %NULL, even if there are non-NULL entries at some of the indices
741 * covered by the range). This is not a problem for any current users,
742 * and can be changed if needed.
743 *
744 * Return: The old entry at this index.
745 */
746void *xas_store(struct xa_state *xas, void *entry)
747{
748 struct xa_node *node;
749 void __rcu **slot = &xas->xa->xa_head;
750 unsigned int offset, max;
751 int count = 0;
752 int values = 0;
753 void *first, *next;
754 bool value = xa_is_value(entry);
755
756 if (entry)
757 first = xas_create(xas);
758 else
759 first = xas_load(xas);
760
761 if (xas_invalid(xas))
762 return first;
763 node = xas->xa_node;
764 if (node && (xas->xa_shift < node->shift))
765 xas->xa_sibs = 0;
766 if ((first == entry) && !xas->xa_sibs)
767 return first;
768
769 next = first;
770 offset = xas->xa_offset;
771 max = xas->xa_offset + xas->xa_sibs;
772 if (node) {
773 slot = &node->slots[offset];
774 if (xas->xa_sibs)
775 xas_squash_marks(xas);
776 }
777 if (!entry)
778 xas_init_marks(xas);
779
780 for (;;) {
781 /*
782 * Must clear the marks before setting the entry to NULL,
783 * otherwise xas_for_each_marked may find a NULL entry and
784 * stop early. rcu_assign_pointer contains a release barrier
785 * so the mark clearing will appear to happen before the
786 * entry is set to NULL.
787 */
788 rcu_assign_pointer(*slot, entry);
789 if (xa_is_node(next))
790 xas_free_nodes(xas, xa_to_node(next));
791 if (!node)
792 break;
793 count += !next - !entry;
794 values += !xa_is_value(first) - !value;
795 if (entry) {
796 if (offset == max)
797 break;
798 if (!xa_is_sibling(entry))
799 entry = xa_mk_sibling(xas->xa_offset);
800 } else {
801 if (offset == XA_CHUNK_MASK)
802 break;
803 }
804 next = xa_entry_locked(xas->xa, node, ++offset);
805 if (!xa_is_sibling(next)) {
806 if (!entry && (offset > max))
807 break;
808 first = next;
809 }
810 slot++;
811 }
812
813 update_node(xas, node, count, values);
814 return first;
815}
816EXPORT_SYMBOL_GPL(xas_store);
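A minimal sketch of how a caller might use the advanced API above, with invented names: xas_store() runs under the lock, and xas_nomem() handles -ENOMEM by allocating outside the lock and retrying, the same pattern the xa_store() wrapper later in this file follows.

#include <linux/xarray.h>

static DEFINE_XARRAY(example_array);    /* illustrative array, not part of this patch */

static int example_insert(unsigned long index, void *item)
{
        XA_STATE(xas, &example_array, index);

        do {
                xas_lock(&xas);
                xas_store(&xas, item);          /* returns the old entry; ignored here */
                xas_unlock(&xas);
        } while (xas_nomem(&xas, GFP_KERNEL));  /* allocate outside the lock, retry */

        return xas_error(&xas);                 /* 0 on success or a negative errno */
}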
817
818/**
819 * xas_get_mark() - Returns the state of this mark.
820 * @xas: XArray operation state.
821 * @mark: Mark number.
822 *
823 * Return: true if the mark is set, false if the mark is clear or @xas
824 * is in an error state.
825 */
826bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
827{
828 if (xas_invalid(xas))
829 return false;
830 if (!xas->xa_node)
831 return xa_marked(xas->xa, mark);
832 return node_get_mark(xas->xa_node, xas->xa_offset, mark);
833}
834EXPORT_SYMBOL_GPL(xas_get_mark);
835
836/**
837 * xas_set_mark() - Sets the mark on this entry and its parents.
838 * @xas: XArray operation state.
839 * @mark: Mark number.
840 *
841 * Sets the specified mark on this entry, and walks up the tree setting it
842 * on all the ancestor entries. Does nothing if @xas has not been walked to
843 * an entry, or is in an error state.
844 */
845void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
846{
847 struct xa_node *node = xas->xa_node;
848 unsigned int offset = xas->xa_offset;
849
850 if (xas_invalid(xas))
851 return;
852
853 while (node) {
854 if (node_set_mark(node, offset, mark))
855 return;
856 offset = node->offset;
857 node = xa_parent_locked(xas->xa, node);
858 }
859
860 if (!xa_marked(xas->xa, mark))
861 xa_mark_set(xas->xa, mark);
862}
863EXPORT_SYMBOL_GPL(xas_set_mark);
864
865/**
866 * xas_clear_mark() - Clears the mark on this entry and its parents.
867 * @xas: XArray operation state.
868 * @mark: Mark number.
869 *
870 * Clears the specified mark on this entry, and walks back to the head
871 * attempting to clear it on all the ancestor entries. Does nothing if
872 * @xas has not been walked to an entry, or is in an error state.
873 */
874void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
875{
876 struct xa_node *node = xas->xa_node;
877 unsigned int offset = xas->xa_offset;
878
879 if (xas_invalid(xas))
880 return;
881
882 while (node) {
883 if (!node_clear_mark(node, offset, mark))
884 return;
885 if (node_any_mark(node, mark))
886 return;
887
888 offset = node->offset;
889 node = xa_parent_locked(xas->xa, node);
890 }
891
892 if (xa_marked(xas->xa, mark))
893 xa_mark_clear(xas->xa, mark);
894}
895EXPORT_SYMBOL_GPL(xas_clear_mark);
896
897/**
898 * xas_init_marks() - Initialise all marks for the entry
899 * @xas: Array operation state.
900 *
901 * Initialise all marks for the entry specified by @xas. If we're tracking
902 * free entries with a mark, we need to set it on all entries. All other
903 * marks are cleared.
904 *
905 * This implementation is not as efficient as it could be; we may walk
906 * up the tree multiple times.
907 */
908void xas_init_marks(const struct xa_state *xas)
909{
910 xa_mark_t mark = 0;
911
912 for (;;) {
913 if (xa_track_free(xas->xa) && mark == XA_FREE_MARK)
914 xas_set_mark(xas, mark);
915 else
916 xas_clear_mark(xas, mark);
917 if (mark == XA_MARK_MAX)
918 break;
919 mark_inc(mark);
920 }
921}
922EXPORT_SYMBOL_GPL(xas_init_marks);
923
924/**
925 * xas_pause() - Pause a walk to drop a lock.
926 * @xas: XArray operation state.
927 *
928 * Some users need to pause a walk and drop the lock they're holding in
929 * order to yield to a higher priority thread or carry out an operation
930 * on an entry. Those users should call this function before they drop
931 * the lock. It resets the @xas to be suitable for the next iteration
932 * of the loop after the user has reacquired the lock. If most entries
933 * found during a walk require you to call xas_pause(), the xa_for_each()
934 * iterator may be more appropriate.
935 *
936 * Note that xas_pause() only works for forward iteration. If a user needs
937 * to pause a reverse iteration, we will need a xas_pause_rev().
938 */
939void xas_pause(struct xa_state *xas)
940{
941 struct xa_node *node = xas->xa_node;
942
943 if (xas_invalid(xas))
944 return;
945
946 if (node) {
947 unsigned int offset = xas->xa_offset;
948 while (++offset < XA_CHUNK_SIZE) {
949 if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
950 break;
951 }
952 xas->xa_index += (offset - xas->xa_offset) << node->shift;
953 } else {
954 xas->xa_index++;
955 }
956 xas->xa_node = XAS_RESTART;
957}
958EXPORT_SYMBOL_GPL(xas_pause);
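A sketch of the pause pattern described above, assuming a caller that wants to reschedule during a long walk; the batch size and all names are invented for illustration.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/xarray.h>

static void example_walk(struct xarray *xa)
{
        XA_STATE(xas, xa, 0);
        void *entry;
        unsigned int seen = 0;

        rcu_read_lock();
        xas_for_each(&xas, entry, ULONG_MAX) {
                if (xas_retry(&xas, entry))
                        continue;
                /* ... process entry ... */
                if (++seen % 64)
                        continue;
                xas_pause(&xas);        /* make the walk safe to resume later */
                rcu_read_unlock();
                cond_resched();
                rcu_read_lock();
        }
        rcu_read_unlock();
}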
959
960/*
961 * __xas_prev() - Find the previous entry in the XArray.
962 * @xas: XArray operation state.
963 *
964 * Helper function for xas_prev() which handles all the complex cases
965 * out of line.
966 */
967void *__xas_prev(struct xa_state *xas)
968{
969 void *entry;
970
971 if (!xas_frozen(xas->xa_node))
972 xas->xa_index--;
973 if (xas_not_node(xas->xa_node))
974 return xas_load(xas);
975
976 if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
977 xas->xa_offset--;
978
979 while (xas->xa_offset == 255) {
980 xas->xa_offset = xas->xa_node->offset - 1;
981 xas->xa_node = xa_parent(xas->xa, xas->xa_node);
982 if (!xas->xa_node)
983 return set_bounds(xas);
984 }
985
986 for (;;) {
987 entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
988 if (!xa_is_node(entry))
989 return entry;
990
991 xas->xa_node = xa_to_node(entry);
992 xas_set_offset(xas);
993 }
994}
995EXPORT_SYMBOL_GPL(__xas_prev);
996
997/*
998 * __xas_next() - Find the next entry in the XArray.
999 * @xas: XArray operation state.
1000 *
1001 * Helper function for xas_next() which handles all the complex cases
1002 * out of line.
1003 */
1004void *__xas_next(struct xa_state *xas)
1005{
1006 void *entry;
1007
1008 if (!xas_frozen(xas->xa_node))
1009 xas->xa_index++;
1010 if (xas_not_node(xas->xa_node))
1011 return xas_load(xas);
1012
1013 if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
1014 xas->xa_offset++;
1015
1016 while (xas->xa_offset == XA_CHUNK_SIZE) {
1017 xas->xa_offset = xas->xa_node->offset + 1;
1018 xas->xa_node = xa_parent(xas->xa, xas->xa_node);
1019 if (!xas->xa_node)
1020 return set_bounds(xas);
1021 }
1022
1023 for (;;) {
1024 entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
1025 if (!xa_is_node(entry))
1026 return entry;
1027
1028 xas->xa_node = xa_to_node(entry);
1029 xas_set_offset(xas);
1030 }
1031}
1032EXPORT_SYMBOL_GPL(__xas_next);
1033
1034/**
1035 * xas_find() - Find the next present entry in the XArray.
1036 * @xas: XArray operation state.
1037 * @max: Highest index to return.
1038 *
1039 * If the @xas has not yet been walked to an entry, return the entry
1040 * which has an index >= xas.xa_index. If it has been walked, the entry
1041 * currently being pointed at has been processed, and so we move to the
1042 * next entry.
1043 *
1044 * If no entry is found and the array is smaller than @max, the iterator
1045 * is set to the smallest index not yet in the array. This allows @xas
1046 * to be immediately passed to xas_store().
1047 *
1048 * Return: The entry, if found, otherwise %NULL.
1049 */
1050void *xas_find(struct xa_state *xas, unsigned long max)
1051{
1052 void *entry;
1053
1054 if (xas_error(xas))
1055 return NULL;
1056
1057 if (!xas->xa_node) {
1058 xas->xa_index = 1;
1059 return set_bounds(xas);
1060 } else if (xas_top(xas->xa_node)) {
1061 entry = xas_load(xas);
1062 if (entry || xas_not_node(xas->xa_node))
1063 return entry;
1064 } else if (!xas->xa_node->shift &&
1065 xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
1066 xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
1067 }
1068
1069 xas_advance(xas);
1070
1071 while (xas->xa_node && (xas->xa_index <= max)) {
1072 if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
1073 xas->xa_offset = xas->xa_node->offset + 1;
1074 xas->xa_node = xa_parent(xas->xa, xas->xa_node);
1075 continue;
1076 }
1077
1078 entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
1079 if (xa_is_node(entry)) {
1080 xas->xa_node = xa_to_node(entry);
1081 xas->xa_offset = 0;
1082 continue;
1083 }
1084 if (entry && !xa_is_sibling(entry))
1085 return entry;
1086
1087 xas_advance(xas);
1088 }
1089
1090 if (!xas->xa_node)
1091 xas->xa_node = XAS_BOUNDS;
1092 return NULL;
1093}
1094EXPORT_SYMBOL_GPL(xas_find);
1095
1096/**
1097 * xas_find_marked() - Find the next marked entry in the XArray.
1098 * @xas: XArray operation state.
1099 * @max: Highest index to return.
1100 * @mark: Mark number to search for.
1101 *
1102 * If the @xas has not yet been walked to an entry, return the marked entry
1103 * which has an index >= xas.xa_index. If it has been walked, the entry
1104 * currently being pointed at has been processed, and so we return the
1105 * first marked entry with an index > xas.xa_index.
1106 *
1107 * If no marked entry is found and the array is smaller than @max, @xas is
1108 * set to the bounds state and xas->xa_index is set to the smallest index
1109 * not yet in the array. This allows @xas to be immediately passed to
1110 * xas_store().
1111 *
1112 * If no entry is found before @max is reached, @xas is set to the restart
1113 * state.
1114 *
1115 * Return: The entry, if found, otherwise %NULL.
1116 */
1117void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
1118{
1119 bool advance = true;
1120 unsigned int offset;
1121 void *entry;
1122
1123 if (xas_error(xas))
1124 return NULL;
1125
1126 if (!xas->xa_node) {
1127 xas->xa_index = 1;
1128 goto out;
1129 } else if (xas_top(xas->xa_node)) {
1130 advance = false;
1131 entry = xa_head(xas->xa);
1132 xas->xa_node = NULL;
1133 if (xas->xa_index > max_index(entry))
1134 goto bounds;
1135 if (!xa_is_node(entry)) {
1136 if (xa_marked(xas->xa, mark))
1137 return entry;
1138 xas->xa_index = 1;
1139 goto out;
1140 }
1141 xas->xa_node = xa_to_node(entry);
1142 xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
1143 }
1144
1145 while (xas->xa_index <= max) {
1146 if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
1147 xas->xa_offset = xas->xa_node->offset + 1;
1148 xas->xa_node = xa_parent(xas->xa, xas->xa_node);
1149 if (!xas->xa_node)
1150 break;
1151 advance = false;
1152 continue;
1153 }
1154
1155 if (!advance) {
1156 entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
1157 if (xa_is_sibling(entry)) {
1158 xas->xa_offset = xa_to_sibling(entry);
1159 xas_move_index(xas, xas->xa_offset);
1160 }
1161 }
1162
1163 offset = xas_find_chunk(xas, advance, mark);
1164 if (offset > xas->xa_offset) {
1165 advance = false;
1166 xas_move_index(xas, offset);
1167 /* Mind the wrap */
1168 if ((xas->xa_index - 1) >= max)
1169 goto max;
1170 xas->xa_offset = offset;
1171 if (offset == XA_CHUNK_SIZE)
1172 continue;
1173 }
1174
1175 entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
1176 if (!xa_is_node(entry))
1177 return entry;
1178 xas->xa_node = xa_to_node(entry);
1179 xas_set_offset(xas);
1180 }
1181
1182out:
1183 if (!max)
1184 goto max;
1185bounds:
1186 xas->xa_node = XAS_BOUNDS;
1187 return NULL;
1188max:
1189 xas->xa_node = XAS_RESTART;
1190 return NULL;
1191}
1192EXPORT_SYMBOL_GPL(xas_find_marked);
1193
1194/**
1195 * xas_find_conflict() - Find the next present entry in a range.
1196 * @xas: XArray operation state.
1197 *
1198 * The @xas describes both a range and a position within that range.
1199 *
1200 * Context: Any context. Expects xa_lock to be held.
1201 * Return: The next entry in the range covered by @xas or %NULL.
1202 */
1203void *xas_find_conflict(struct xa_state *xas)
1204{
1205 void *curr;
1206
1207 if (xas_error(xas))
1208 return NULL;
1209
1210 if (!xas->xa_node)
1211 return NULL;
1212
1213 if (xas_top(xas->xa_node)) {
1214 curr = xas_start(xas);
1215 if (!curr)
1216 return NULL;
1217 while (xa_is_node(curr)) {
1218 struct xa_node *node = xa_to_node(curr);
1219 curr = xas_descend(xas, node);
1220 }
1221 if (curr)
1222 return curr;
1223 }
1224
1225 if (xas->xa_node->shift > xas->xa_shift)
1226 return NULL;
1227
1228 for (;;) {
1229 if (xas->xa_node->shift == xas->xa_shift) {
1230 if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
1231 break;
1232 } else if (xas->xa_offset == XA_CHUNK_MASK) {
1233 xas->xa_offset = xas->xa_node->offset;
1234 xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
1235 if (!xas->xa_node)
1236 break;
1237 continue;
1238 }
1239 curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
1240 if (xa_is_sibling(curr))
1241 continue;
1242 while (xa_is_node(curr)) {
1243 xas->xa_node = xa_to_node(curr);
1244 xas->xa_offset = 0;
1245 curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
1246 }
1247 if (curr)
1248 return curr;
1249 }
1250 xas->xa_offset -= xas->xa_sibs;
1251 return NULL;
1252}
1253EXPORT_SYMBOL_GPL(xas_find_conflict);
1254
1255/**
1256 * xa_init_flags() - Initialise an empty XArray with flags.
1257 * @xa: XArray.
1258 * @flags: XA_FLAG values.
1259 *
1260 * If you need to initialise an XArray with special flags (eg you need
1261 * to take the lock from interrupt context), use this function instead
1262 * of xa_init().
1263 *
1264 * Context: Any context.
1265 */
1266void xa_init_flags(struct xarray *xa, gfp_t flags)
1267{
1268 unsigned int lock_type;
1269 static struct lock_class_key xa_lock_irq;
1270 static struct lock_class_key xa_lock_bh;
1271
1272 spin_lock_init(&xa->xa_lock);
1273 xa->xa_flags = flags;
1274 xa->xa_head = NULL;
1275
1276 lock_type = xa_lock_type(xa);
1277 if (lock_type == XA_LOCK_IRQ)
1278 lockdep_set_class(&xa->xa_lock, &xa_lock_irq);
1279 else if (lock_type == XA_LOCK_BH)
1280 lockdep_set_class(&xa->xa_lock, &xa_lock_bh);
1281}
1282EXPORT_SYMBOL(xa_init_flags);
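For example, a structure whose array is touched from interrupt context might be initialised as in the hypothetical sketch below; callers would then pair xa_lock_irqsave() with the locked __xa_*() helpers.

#include <linux/xarray.h>

struct example_dev {
        struct xarray pending;          /* indexed by request tag */
};

static void example_dev_init(struct example_dev *dev)
{
        /*
         * Tell the XArray (and lockdep) that xa_lock is taken from
         * interrupt context; users then pair xa_lock_irqsave() with
         * the __xa_*() operations.
         */
        xa_init_flags(&dev->pending, XA_FLAGS_LOCK_IRQ);
}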
1283
1284/**
1285 * xa_load() - Load an entry from an XArray.
1286 * @xa: XArray.
1287 * @index: Index into array.
1288 *
1289 * Context: Any context. Takes and releases the RCU lock.
1290 * Return: The entry at @index in @xa.
1291 */
1292void *xa_load(struct xarray *xa, unsigned long index)
1293{
1294 XA_STATE(xas, xa, index);
1295 void *entry;
1296
1297 rcu_read_lock();
1298 do {
1299 entry = xas_load(&xas);
1300 if (xa_is_zero(entry))
1301 entry = NULL;
1302 } while (xas_retry(&xas, entry));
1303 rcu_read_unlock();
1304
1305 return entry;
1306}
1307EXPORT_SYMBOL(xa_load);
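A small usage sketch, assuming the array stores integers as value entries created with xa_mk_value(); that is an assumption of the example, not a requirement of xa_load().

#include <linux/xarray.h>

static unsigned long example_read_counter(struct xarray *xa, unsigned long index)
{
        void *entry = xa_load(xa, index);       /* no locking needed for a lookup */

        if (xa_is_value(entry))
                return xa_to_value(entry);      /* decode the stored integer */
        return 0;                               /* absent, or not a value entry */
}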
1308
1309static void *xas_result(struct xa_state *xas, void *curr)
1310{
1311 if (xa_is_zero(curr))
1312 return NULL;
1313 XA_NODE_BUG_ON(xas->xa_node, xa_is_internal(curr));
1314 if (xas_error(xas))
1315 curr = xas->xa_node;
1316 return curr;
1317}
1318
1319/**
1320 * __xa_erase() - Erase this entry from the XArray while locked.
1321 * @xa: XArray.
1322 * @index: Index into array.
1323 *
1324 * If the entry at this index is a multi-index entry then all indices will
1325 * be erased, and the entry will no longer be a multi-index entry.
1326 * This function neither drops the xa_lock nor allocates memory, so it
1327 * cannot fail.
1328 *
1329 * Context: Any context. Expects xa_lock to be held on entry.
1330 * Return: The old entry at this index.
1331 */
1332void *__xa_erase(struct xarray *xa, unsigned long index)
1333{
1334 XA_STATE(xas, xa, index);
1335 return xas_result(&xas, xas_store(&xas, NULL));
1336}
1337EXPORT_SYMBOL_GPL(__xa_erase);
1338
1339/**
1340 * xa_store() - Store this entry in the XArray.
1341 * @xa: XArray.
1342 * @index: Index into array.
1343 * @entry: New entry.
1344 * @gfp: Memory allocation flags.
1345 *
1346 * After this function returns, loads from this index will return @entry.
1347 * Storing into an existing multi-index entry updates the entry of every index.
1348 * The marks associated with @index are unaffected unless @entry is %NULL.
1349 *
1350 * Context: Process context. Takes and releases the xa_lock. May sleep
1351 * if the @gfp flags permit.
1352 * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
1353 * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
1354 * failed.
1355 */
1356void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
1357{
1358 XA_STATE(xas, xa, index);
1359 void *curr;
1360
1361 if (WARN_ON_ONCE(xa_is_internal(entry)))
1362 return XA_ERROR(-EINVAL);
1363
1364 do {
1365 xas_lock(&xas);
1366 curr = xas_store(&xas, entry);
1367 if (xa_track_free(xa) && entry)
1368 xas_clear_mark(&xas, XA_FREE_MARK);
1369 xas_unlock(&xas);
1370 } while (xas_nomem(&xas, gfp));
1371
1372 return xas_result(&xas, curr);
1373}
1374EXPORT_SYMBOL(xa_store);
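A sketch of the error convention, with invented names: errors are encoded in the returned pointer and decoded with xa_err().

#include <linux/slab.h>
#include <linux/xarray.h>

static int example_replace(struct xarray *xa, unsigned long index, void *new)
{
        void *old = xa_store(xa, index, new, GFP_KERNEL);

        if (xa_is_err(old))
                return xa_err(old);     /* -EINVAL or -ENOMEM */
        kfree(old);                     /* assumes old entries were kmalloc()ed */
        return 0;
}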
1375
1376/**
1377 * __xa_store() - Store this entry in the XArray.
1378 * @xa: XArray.
1379 * @index: Index into array.
1380 * @entry: New entry.
1381 * @gfp: Memory allocation flags.
1382 *
1383 * You must already be holding the xa_lock when calling this function.
1384 * It will drop the lock if needed to allocate memory, and then reacquire
1385 * it afterwards.
1386 *
1387 * Context: Any context. Expects xa_lock to be held on entry. May
1388 * release and reacquire xa_lock if @gfp flags permit.
1389 * Return: The old entry at this index or xa_err() if an error happened.
1390 */
1391void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
1392{
1393 XA_STATE(xas, xa, index);
1394 void *curr;
1395
1396 if (WARN_ON_ONCE(xa_is_internal(entry)))
1397 return XA_ERROR(-EINVAL);
1398
1399 do {
1400 curr = xas_store(&xas, entry);
1401 if (xa_track_free(xa) && entry)
1402 xas_clear_mark(&xas, XA_FREE_MARK);
1403 } while (__xas_nomem(&xas, gfp));
1404
1405 return xas_result(&xas, curr);
1406}
1407EXPORT_SYMBOL(__xa_store);
1408
1409/**
1410 * xa_cmpxchg() - Conditionally replace an entry in the XArray.
1411 * @xa: XArray.
1412 * @index: Index into array.
1413 * @old: Old value to test against.
1414 * @entry: New value to place in array.
1415 * @gfp: Memory allocation flags.
1416 *
1417 * If the entry at @index is the same as @old, replace it with @entry.
1418 * If the return value is equal to @old, then the exchange was successful.
1419 *
1420 * Context: Process context. Takes and releases the xa_lock. May sleep
1421 * if the @gfp flags permit.
1422 * Return: The old value at this index or xa_err() if an error happened.
1423 */
1424void *xa_cmpxchg(struct xarray *xa, unsigned long index,
1425 void *old, void *entry, gfp_t gfp)
1426{
1427 XA_STATE(xas, xa, index);
1428 void *curr;
1429
1430 if (WARN_ON_ONCE(xa_is_internal(entry)))
1431 return XA_ERROR(-EINVAL);
1432
1433 do {
1434 xas_lock(&xas);
1435 curr = xas_load(&xas);
1436 if (curr == XA_ZERO_ENTRY)
1437 curr = NULL;
1438 if (curr == old) {
1439 xas_store(&xas, entry);
1440 if (xa_track_free(xa) && entry)
1441 xas_clear_mark(&xas, XA_FREE_MARK);
1442 }
1443 xas_unlock(&xas);
1444 } while (xas_nomem(&xas, gfp));
1445
1446 return xas_result(&xas, curr);
1447}
1448EXPORT_SYMBOL(xa_cmpxchg);
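The compare-exchange is a natural fit for insert-if-absent, sketched here with invented names.

#include <linux/errno.h>
#include <linux/xarray.h>

static int example_insert_unique(struct xarray *xa, unsigned long index,
                                 void *item)
{
        void *curr = xa_cmpxchg(xa, index, NULL, item, GFP_KERNEL);

        if (xa_is_err(curr))
                return xa_err(curr);
        if (curr)
                return -EBUSY;          /* another thread stored something first */
        return 0;                       /* curr == NULL: our entry went in */
}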
1449
1450/**
1451 * __xa_cmpxchg() - Conditionally replace an entry in the XArray while locked.
1452 * @xa: XArray.
1453 * @index: Index into array.
1454 * @old: Old value to test against.
1455 * @entry: New entry.
1456 * @gfp: Memory allocation flags.
1457 *
1458 * You must already be holding the xa_lock when calling this function.
1459 * It will drop the lock if needed to allocate memory, and then reacquire
1460 * it afterwards.
1461 *
1462 * Context: Any context. Expects xa_lock to be held on entry. May
1463 * release and reacquire xa_lock if @gfp flags permit.
1464 * Return: The old entry at this index or xa_err() if an error happened.
1465 */
1466void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
1467 void *old, void *entry, gfp_t gfp)
1468{
1469 XA_STATE(xas, xa, index);
1470 void *curr;
1471
1472 if (WARN_ON_ONCE(xa_is_internal(entry)))
1473 return XA_ERROR(-EINVAL);
1474
1475 do {
1476 curr = xas_load(&xas);
1477 if (curr == XA_ZERO_ENTRY)
1478 curr = NULL;
1479 if (curr == old) {
1480 xas_store(&xas, entry);
1481 if (xa_track_free(xa) && entry)
1482 xas_clear_mark(&xas, XA_FREE_MARK);
1483 }
1484 } while (__xas_nomem(&xas, gfp));
1485
1486 return xas_result(&xas, curr);
1487}
1488EXPORT_SYMBOL(__xa_cmpxchg);
1489
1490/**
1491 * xa_reserve() - Reserve this index in the XArray.
1492 * @xa: XArray.
1493 * @index: Index into array.
1494 * @gfp: Memory allocation flags.
1495 *
1496 * Ensures there is somewhere to store an entry at @index in the array.
1497 * If there is already something stored at @index, this function does
1498 * nothing. If there was nothing there, the entry is marked as reserved.
1499 * Loads from @index will continue to see a %NULL pointer until a
1500 * subsequent store to @index.
1501 *
1502 * If you do not use the entry that you have reserved, call xa_release()
1503 * or xa_erase() to free any unnecessary memory.
1504 *
1505 * Context: Process context. Takes and releases the xa_lock, IRQ or BH safe
1506 * if specified in XArray flags. May sleep if the @gfp flags permit.
1507 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
1508 */
1509int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
1510{
1511 XA_STATE(xas, xa, index);
1512 unsigned int lock_type = xa_lock_type(xa);
1513 void *curr;
1514
1515 do {
1516 xas_lock_type(&xas, lock_type);
1517 curr = xas_load(&xas);
1518 if (!curr)
1519 xas_store(&xas, XA_ZERO_ENTRY);
1520 xas_unlock_type(&xas, lock_type);
1521 } while (xas_nomem(&xas, gfp));
1522
1523 return xas_error(&xas);
1524}
1525EXPORT_SYMBOL(xa_reserve);
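One way a caller might use reservation, sketched with invented names: reserve while sleeping is still allowed, then fill the slot from an atomic section where allocation must not happen.

#include <linux/xarray.h>

static int example_publish(struct xarray *xa, unsigned long index, void *item)
{
        int err;

        err = xa_reserve(xa, index, GFP_KERNEL);        /* may sleep */
        if (err)
                return err;

        xa_lock(xa);
        /* The slot already exists, so this store cannot hit -ENOMEM. */
        __xa_store(xa, index, item, GFP_NOWAIT);
        xa_unlock(xa);
        return 0;
}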
1526
1527#ifdef CONFIG_XARRAY_MULTI
1528static void xas_set_range(struct xa_state *xas, unsigned long first,
1529 unsigned long last)
1530{
1531 unsigned int shift = 0;
1532 unsigned long sibs = last - first;
1533 unsigned int offset = XA_CHUNK_MASK;
1534
1535 xas_set(xas, first);
1536
1537 while ((first & XA_CHUNK_MASK) == 0) {
1538 if (sibs < XA_CHUNK_MASK)
1539 break;
1540 if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
1541 break;
1542 shift += XA_CHUNK_SHIFT;
1543 if (offset == XA_CHUNK_MASK)
1544 offset = sibs & XA_CHUNK_MASK;
1545 sibs >>= XA_CHUNK_SHIFT;
1546 first >>= XA_CHUNK_SHIFT;
1547 }
1548
1549 offset = first & XA_CHUNK_MASK;
1550 if (offset + sibs > XA_CHUNK_MASK)
1551 sibs = XA_CHUNK_MASK - offset;
1552 if ((((first + sibs + 1) << shift) - 1) > last)
1553 sibs -= 1;
1554
1555 xas->xa_shift = shift;
1556 xas->xa_sibs = sibs;
1557}
1558
1559/**
1560 * xa_store_range() - Store this entry at a range of indices in the XArray.
1561 * @xa: XArray.
1562 * @first: First index to affect.
1563 * @last: Last index to affect.
1564 * @entry: New entry.
1565 * @gfp: Memory allocation flags.
1566 *
1567 * After this function returns, loads from any index between @first and @last,
1568 * inclusive, will return @entry.
1569 * Storing into an existing multi-index entry updates the entry of every index.
1570 * The marks on the stored indices are unaffected unless @entry is %NULL.
1571 *
1572 * Context: Process context. Takes and releases the xa_lock. May sleep
1573 * if the @gfp flags permit.
1574 * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
1575 * an XArray, or xa_err(-ENOMEM) if memory allocation failed.
1576 */
1577void *xa_store_range(struct xarray *xa, unsigned long first,
1578 unsigned long last, void *entry, gfp_t gfp)
1579{
1580 XA_STATE(xas, xa, 0);
1581
1582 if (WARN_ON_ONCE(xa_is_internal(entry)))
1583 return XA_ERROR(-EINVAL);
1584 if (last < first)
1585 return XA_ERROR(-EINVAL);
1586
1587 do {
1588 xas_lock(&xas);
1589 if (entry) {
1590 unsigned int order = (last == ~0UL) ? 64 :
1591 ilog2(last + 1);
1592 xas_set_order(&xas, last, order);
1593 xas_create(&xas);
1594 if (xas_error(&xas))
1595 goto unlock;
1596 }
1597 do {
1598 xas_set_range(&xas, first, last);
1599 xas_store(&xas, entry);
1600 if (xas_error(&xas))
1601 goto unlock;
1602 first += xas_size(&xas);
1603 } while (first <= last);
1604unlock:
1605 xas_unlock(&xas);
1606 } while (xas_nomem(&xas, gfp));
1607
1608 return xas_result(&xas, NULL);
1609}
1610EXPORT_SYMBOL(xa_store_range);
1611#endif /* CONFIG_XARRAY_MULTI */
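When CONFIG_XARRAY_MULTI is enabled, a caller might map a whole extent of indices to a single descriptor, as in this hypothetical sketch.

#include <linux/xarray.h>

static int example_map_extent(struct xarray *xa, unsigned long first,
                              unsigned long last, void *desc)
{
        /* Every load in [first, last] will return desc afterwards. */
        void *ret = xa_store_range(xa, first, last, desc, GFP_KERNEL);

        return xa_is_err(ret) ? xa_err(ret) : 0;
}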
1612
1613/**
1614 * __xa_alloc() - Find somewhere to store this entry in the XArray.
1615 * @xa: XArray.
1616 * @id: Pointer to ID.
1617 * @max: Maximum ID to allocate (inclusive).
1618 * @entry: New entry.
1619 * @gfp: Memory allocation flags.
1620 *
1621 * Allocates an unused ID in the range specified by @id and @max.
1622 * Updates the @id pointer with the index, then stores the entry at that
1623 * index. A concurrent lookup will not see an uninitialised @id.
1624 *
1625 * Context: Any context. Expects xa_lock to be held on entry. May
1626 * release and reacquire xa_lock if @gfp flags permit.
1627 * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
1628 * there is no more space in the XArray.
1629 */
1630int __xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, gfp_t gfp)
1631{
1632 XA_STATE(xas, xa, 0);
1633 int err;
1634
1635 if (WARN_ON_ONCE(xa_is_internal(entry)))
1636 return -EINVAL;
1637 if (WARN_ON_ONCE(!xa_track_free(xa)))
1638 return -EINVAL;
1639
1640 if (!entry)
1641 entry = XA_ZERO_ENTRY;
1642
1643 do {
1644 xas.xa_index = *id;
1645 xas_find_marked(&xas, max, XA_FREE_MARK);
1646 if (xas.xa_node == XAS_RESTART)
1647 xas_set_err(&xas, -ENOSPC);
1648 xas_store(&xas, entry);
1649 xas_clear_mark(&xas, XA_FREE_MARK);
1650 } while (__xas_nomem(&xas, gfp));
1651
1652 err = xas_error(&xas);
1653 if (!err)
1654 *id = xas.xa_index;
1655 return err;
1656}
1657EXPORT_SYMBOL(__xa_alloc);
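A sketch of ID allocation with the locked helper; it assumes the array was initialised with XA_FLAGS_ALLOC so free slots are tracked, and all names are illustrative.

#include <linux/kernel.h>
#include <linux/xarray.h>

static struct xarray example_ids;       /* initialised elsewhere with XA_FLAGS_ALLOC */

static int example_new_id(void *item, u32 *out_id)
{
        u32 id = 0;                     /* search for a free index starting at 0 */
        int err;

        xa_lock(&example_ids);
        err = __xa_alloc(&example_ids, &id, UINT_MAX, item, GFP_KERNEL);
        xa_unlock(&example_ids);

        if (!err)
                *out_id = id;
        return err;                     /* 0, -ENOMEM or -ENOSPC */
}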
1658
1659/**
1660 * __xa_set_mark() - Set this mark on this entry while locked.
1661 * @xa: XArray.
1662 * @index: Index of entry.
1663 * @mark: Mark number.
1664 *
1665 * Attempting to set a mark on a NULL entry does not succeed.
1666 *
1667 * Context: Any context. Expects xa_lock to be held on entry.
1668 */
1669void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
1670{
1671 XA_STATE(xas, xa, index);
1672 void *entry = xas_load(&xas);
1673
1674 if (entry)
1675 xas_set_mark(&xas, mark);
1676}
1677EXPORT_SYMBOL_GPL(__xa_set_mark);
1678
1679/**
1680 * __xa_clear_mark() - Clear this mark on this entry while locked.
1681 * @xa: XArray.
1682 * @index: Index of entry.
1683 * @mark: Mark number.
1684 *
1685 * Context: Any context. Expects xa_lock to be held on entry.
1686 */
1687void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
1688{
1689 XA_STATE(xas, xa, index);
1690 void *entry = xas_load(&xas);
1691
1692 if (entry)
1693 xas_clear_mark(&xas, mark);
1694}
1695EXPORT_SYMBOL_GPL(__xa_clear_mark);
1696
1697/**
1698 * xa_get_mark() - Inquire whether this mark is set on this entry.
1699 * @xa: XArray.
1700 * @index: Index of entry.
1701 * @mark: Mark number.
1702 *
1703 * This function uses the RCU read lock, so the result may be out of date
1704 * by the time it returns. If you need the result to be stable, use a lock.
1705 *
1706 * Context: Any context. Takes and releases the RCU lock.
1707 * Return: True if the entry at @index has this mark set, false if it doesn't.
1708 */
1709bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
1710{
1711 XA_STATE(xas, xa, index);
1712 void *entry;
1713
1714 rcu_read_lock();
1715 entry = xas_start(&xas);
1716 while (xas_get_mark(&xas, mark)) {
1717 if (!xa_is_node(entry))
1718 goto found;
1719 entry = xas_descend(&xas, xa_to_node(entry));
1720 }
1721 rcu_read_unlock();
1722 return false;
1723 found:
1724 rcu_read_unlock();
1725 return true;
1726}
1727EXPORT_SYMBOL(xa_get_mark);
1728
1729/**
1730 * xa_set_mark() - Set this mark on this entry.
1731 * @xa: XArray.
1732 * @index: Index of entry.
1733 * @mark: Mark number.
1734 *
1735 * Attempting to set a mark on a NULL entry does not succeed.
1736 *
1737 * Context: Process context. Takes and releases the xa_lock.
1738 */
1739void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
1740{
1741 xa_lock(xa);
1742 __xa_set_mark(xa, index, mark);
1743 xa_unlock(xa);
1744}
1745EXPORT_SYMBOL(xa_set_mark);
1746
1747/**
1748 * xa_clear_mark() - Clear this mark on this entry.
1749 * @xa: XArray.
1750 * @index: Index of entry.
1751 * @mark: Mark number.
1752 *
1753 * Clearing a mark always succeeds.
1754 *
1755 * Context: Process context. Takes and releases the xa_lock.
1756 */
1757void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
1758{
1759 xa_lock(xa);
1760 __xa_clear_mark(xa, index, mark);
1761 xa_unlock(xa);
1762}
1763EXPORT_SYMBOL(xa_clear_mark);
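Marks act as a few cheap per-entry flag bits (XA_MARK_0 through XA_MARK_2 in this version); the sketch below tags present entries as dirty, and the alias is invented.

#include <linux/xarray.h>

#define EXAMPLE_DIRTY   XA_MARK_0       /* illustrative alias for mark 0 */

static void example_mark_dirty(struct xarray *xa, unsigned long index)
{
        /* Setting a mark only succeeds if an entry is present at index. */
        xa_set_mark(xa, index, EXAMPLE_DIRTY);
}

static bool example_test_dirty(struct xarray *xa, unsigned long index)
{
        return xa_get_mark(xa, index, EXAMPLE_DIRTY);
}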
1764
1765/**
1766 * xa_find() - Search the XArray for an entry.
1767 * @xa: XArray.
1768 * @indexp: Pointer to an index.
1769 * @max: Maximum index to search to.
1770 * @filter: Selection criterion.
1771 *
1772 * Finds the entry in @xa which matches the @filter, and has the lowest
1773 * index that is at least @indexp and no more than @max.
1774 * If an entry is found, @indexp is updated to be the index of the entry.
1775 * This function is protected by the RCU read lock, so it may not find
1776 * entries which are being simultaneously added. It will not return an
1777 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
1778 *
1779 * Context: Any context. Takes and releases the RCU lock.
1780 * Return: The entry, if found, otherwise %NULL.
1781 */
1782void *xa_find(struct xarray *xa, unsigned long *indexp,
1783 unsigned long max, xa_mark_t filter)
1784{
1785 XA_STATE(xas, xa, *indexp);
1786 void *entry;
1787
1788 rcu_read_lock();
1789 do {
1790 if ((__force unsigned int)filter < XA_MAX_MARKS)
1791 entry = xas_find_marked(&xas, max, filter);
1792 else
1793 entry = xas_find(&xas, max);
1794 } while (xas_retry(&xas, entry));
1795 rcu_read_unlock();
1796
1797 if (entry)
1798 *indexp = xas.xa_index;
1799 return entry;
1800}
1801EXPORT_SYMBOL(xa_find);
1802
1803/**
1804 * xa_find_after() - Search the XArray for a present entry.
1805 * @xa: XArray.
1806 * @indexp: Pointer to an index.
1807 * @max: Maximum index to search to.
1808 * @filter: Selection criterion.
1809 *
1810 * Finds the entry in @xa which matches the @filter and has the lowest
1811 * index that is above @indexp and no more than @max.
1812 * If an entry is found, @indexp is updated to be the index of the entry.
1813 * This function is protected by the RCU read lock, so it may miss entries
1814 * which are being simultaneously added. It will not return an
1815 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
1816 *
1817 * Context: Any context. Takes and releases the RCU lock.
1818 * Return: The pointer, if found, otherwise %NULL.
1819 */
1820void *xa_find_after(struct xarray *xa, unsigned long *indexp,
1821 unsigned long max, xa_mark_t filter)
1822{
1823 XA_STATE(xas, xa, *indexp + 1);
1824 void *entry;
1825
1826 rcu_read_lock();
1827 for (;;) {
1828 if ((__force unsigned int)filter < XA_MAX_MARKS)
1829 entry = xas_find_marked(&xas, max, filter);
1830 else
1831 entry = xas_find(&xas, max);
1832 if (xas.xa_shift) {
1833 if (xas.xa_index & ((1UL << xas.xa_shift) - 1))
1834 continue;
1835 } else {
1836 if (xas.xa_offset < (xas.xa_index & XA_CHUNK_MASK))
1837 continue;
1838 }
1839 if (!xas_retry(&xas, entry))
1840 break;
1841 }
1842 rcu_read_unlock();
1843
1844 if (entry)
1845 *indexp = xas.xa_index;
1846 return entry;
1847}
1848EXPORT_SYMBOL(xa_find_after);
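Together these two functions give a simple iteration protocol, sketched below with invented names; the same loop can be restricted to marked entries by passing a mark instead of XA_PRESENT.

#include <linux/xarray.h>

static void example_visit_all(struct xarray *xa, unsigned long max)
{
        unsigned long index = 0;
        void *entry;

        /* Visit every present entry with an index in [0, max]. */
        for (entry = xa_find(xa, &index, max, XA_PRESENT); entry;
             entry = xa_find_after(xa, &index, max, XA_PRESENT)) {
                /* ... use entry; index holds its position ... */
        }
}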
1849
1850static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
1851 unsigned long max, unsigned int n)
1852{
1853 void *entry;
1854 unsigned int i = 0;
1855
1856 rcu_read_lock();
1857 xas_for_each(xas, entry, max) {
1858 if (xas_retry(xas, entry))
1859 continue;
1860 dst[i++] = entry;
1861 if (i == n)
1862 break;
1863 }
1864 rcu_read_unlock();
1865
1866 return i;
1867}
1868
1869static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
1870 unsigned long max, unsigned int n, xa_mark_t mark)
1871{
1872 void *entry;
1873 unsigned int i = 0;
1874
1875 rcu_read_lock();
1876 xas_for_each_marked(xas, entry, max, mark) {
1877 if (xas_retry(xas, entry))
1878 continue;
1879 dst[i++] = entry;
1880 if (i == n)
1881 break;
1882 }
1883 rcu_read_unlock();
1884
1885 return i;
1886}
1887
1888/**
1889 * xa_extract() - Copy selected entries from the XArray into a normal array.
1890 * @xa: The source XArray to copy from.
1891 * @dst: The buffer to copy entries into.
1892 * @start: The first index in the XArray eligible to be selected.
1893 * @max: The last index in the XArray eligible to be selected.
1894 * @n: The maximum number of entries to copy.
1895 * @filter: Selection criterion.
1896 *
1897 * Copies up to @n entries that match @filter from the XArray. The
1898 * copied entries will have indices between @start and @max, inclusive.
1899 *
1900 * The @filter may be an XArray mark value, in which case entries which are
1901 * marked with that mark will be copied. It may also be %XA_PRESENT, in
1902 * which case all entries which are not NULL will be copied.
1903 *
1904 * The entries returned may not represent a snapshot of the XArray at a
1905 * moment in time. For example, if another thread stores to index 5, then
1906 * index 10, calling xa_extract() may return the old contents of index 5
1907 * and the new contents of index 10. Indices not modified while this
1908 * function is running will not be skipped.
1909 *
1910 * If you need stronger guarantees, holding the xa_lock across calls to this
1911 * function will prevent concurrent modification.
1912 *
1913 * Context: Any context. Takes and releases the RCU lock.
1914 * Return: The number of entries copied.
1915 */
1916unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
1917 unsigned long max, unsigned int n, xa_mark_t filter)
1918{
1919 XA_STATE(xas, xa, start);
1920
1921 if (!n)
1922 return 0;
1923
1924 if ((__force unsigned int)filter < XA_MAX_MARKS)
1925 return xas_extract_marked(&xas, dst, max, n, filter);
1926 return xas_extract_present(&xas, dst, max, n);
1927}
1928EXPORT_SYMBOL(xa_extract);
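A sketch of pulling entries out in fixed-size batches; the buffer size and names are invented.

#include <linux/kernel.h>
#include <linux/xarray.h>

#define EXAMPLE_BATCH   16

static void example_process_batch(struct xarray *xa)
{
        void *batch[EXAMPLE_BATCH];
        unsigned int i, count;

        /* Copy up to 16 present entries whose indices lie in [0, ULONG_MAX]. */
        count = xa_extract(xa, batch, 0, ULONG_MAX, EXAMPLE_BATCH, XA_PRESENT);
        for (i = 0; i < count; i++) {
                /* ... process batch[i] ... */
        }
}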
1929
1930/**
1931 * xa_destroy() - Free all internal data structures.
1932 * @xa: XArray.
1933 *
1934 * After calling this function, the XArray is empty and has freed all memory
1935 * allocated for its internal data structures. You are responsible for
1936 * freeing the objects referenced by the XArray.
1937 *
1938 * Context: Any context. Takes and releases the xa_lock, interrupt-safe.
1939 */
1940void xa_destroy(struct xarray *xa)
1941{
1942 XA_STATE(xas, xa, 0);
1943 unsigned long flags;
1944 void *entry;
1945
1946 xas.xa_node = NULL;
1947 xas_lock_irqsave(&xas, flags);
1948 entry = xa_head_locked(xa);
1949 RCU_INIT_POINTER(xa->xa_head, NULL);
1950 xas_init_marks(&xas);
1951 /* lockdep checks we're still holding the lock in xas_free_nodes() */
1952 if (xa_is_node(entry))
1953 xas_free_nodes(&xas, xa_to_node(entry));
1954 xas_unlock_irqrestore(&xas, flags);
1955}
1956EXPORT_SYMBOL(xa_destroy);
1957
1958#ifdef XA_DEBUG
1959void xa_dump_node(const struct xa_node *node)
1960{
1961 unsigned i, j;
1962
1963 if (!node)
1964 return;
1965 if ((unsigned long)node & 3) {
1966 pr_cont("node %px\n", node);
1967 return;
1968 }
1969
1970 pr_cont("node %px %s %d parent %px shift %d count %d values %d "
1971 "array %px list %px %px marks",
1972 node, node->parent ? "offset" : "max", node->offset,
1973 node->parent, node->shift, node->count, node->nr_values,
1974 node->array, node->private_list.prev, node->private_list.next);
1975 for (i = 0; i < XA_MAX_MARKS; i++)
1976 for (j = 0; j < XA_MARK_LONGS; j++)
1977 pr_cont(" %lx", node->marks[i][j]);
1978 pr_cont("\n");
1979}
1980
1981void xa_dump_index(unsigned long index, unsigned int shift)
1982{
1983 if (!shift)
1984 pr_info("%lu: ", index);
1985 else if (shift >= BITS_PER_LONG)
1986 pr_info("0-%lu: ", ~0UL);
1987 else
1988 pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
1989}
1990
1991void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
1992{
1993 if (!entry)
1994 return;
1995
1996 xa_dump_index(index, shift);
1997
1998 if (xa_is_node(entry)) {
1999 if (shift == 0) {
2000 pr_cont("%px\n", entry);
2001 } else {
2002 unsigned long i;
2003 struct xa_node *node = xa_to_node(entry);
2004 xa_dump_node(node);
2005 for (i = 0; i < XA_CHUNK_SIZE; i++)
2006 xa_dump_entry(node->slots[i],
2007 index + (i << node->shift), node->shift);
2008 }
2009 } else if (xa_is_value(entry))
2010 pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
2011 xa_to_value(entry), entry);
2012 else if (!xa_is_internal(entry))
2013 pr_cont("%px\n", entry);
2014 else if (xa_is_retry(entry))
2015 pr_cont("retry (%ld)\n", xa_to_internal(entry));
2016 else if (xa_is_sibling(entry))
2017 pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
2018 else if (xa_is_zero(entry))
2019 pr_cont("zero (%ld)\n", xa_to_internal(entry));
2020 else
2021 pr_cont("UNKNOWN ENTRY (%px)\n", entry);
2022}
2023
2024void xa_dump(const struct xarray *xa)
2025{
2026 void *entry = xa->xa_head;
2027 unsigned int shift = 0;
2028
2029 pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry,
2030 xa->xa_flags, xa_marked(xa, XA_MARK_0),
2031 xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));
2032 if (xa_is_node(entry))
2033 shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT;
2034 xa_dump_entry(entry, 0, shift);
2035}
2036#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index de64ea658716..02301a89089e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -379,7 +379,7 @@ config TRANSPARENT_HUGEPAGE
379 bool "Transparent Hugepage Support" 379 bool "Transparent Hugepage Support"
380 depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE 380 depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
381 select COMPACTION 381 select COMPACTION
382 select RADIX_TREE_MULTIORDER 382 select XARRAY_MULTI
383 help 383 help
384 Transparent Hugepages allows the kernel to use huge pages and 384 Transparent Hugepages allows the kernel to use huge pages and
385 huge tlb transparently to the applications whenever possible. 385 huge tlb transparently to the applications whenever possible.
@@ -671,7 +671,7 @@ config ZONE_DEVICE
671 depends on MEMORY_HOTREMOVE 671 depends on MEMORY_HOTREMOVE
672 depends on SPARSEMEM_VMEMMAP 672 depends on SPARSEMEM_VMEMMAP
673 depends on ARCH_HAS_ZONE_DEVICE 673 depends on ARCH_HAS_ZONE_DEVICE
674 select RADIX_TREE_MULTIORDER 674 select XARRAY_MULTI
675 675
676 help 676 help
677 Device memory hotplug support allows for establishing pmem, 677 Device memory hotplug support allows for establishing pmem,
diff --git a/mm/filemap.c b/mm/filemap.c
index 3968da1f7f5a..218d0b2ec82d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -113,60 +113,26 @@
113 * ->tasklist_lock (memory_failure, collect_procs_ao) 113 * ->tasklist_lock (memory_failure, collect_procs_ao)
114 */ 114 */
115 115
116static int page_cache_tree_insert(struct address_space *mapping, 116static void page_cache_delete(struct address_space *mapping,
117 struct page *page, void **shadowp)
118{
119 struct radix_tree_node *node;
120 void **slot;
121 int error;
122
123 error = __radix_tree_create(&mapping->i_pages, page->index, 0,
124 &node, &slot);
125 if (error)
126 return error;
127 if (*slot) {
128 void *p;
129
130 p = radix_tree_deref_slot_protected(slot,
131 &mapping->i_pages.xa_lock);
132 if (!radix_tree_exceptional_entry(p))
133 return -EEXIST;
134
135 mapping->nrexceptional--;
136 if (shadowp)
137 *shadowp = p;
138 }
139 __radix_tree_replace(&mapping->i_pages, node, slot, page,
140 workingset_lookup_update(mapping));
141 mapping->nrpages++;
142 return 0;
143}
144
145static void page_cache_tree_delete(struct address_space *mapping,
146 struct page *page, void *shadow) 117 struct page *page, void *shadow)
147{ 118{
148 int i, nr; 119 XA_STATE(xas, &mapping->i_pages, page->index);
120 unsigned int nr = 1;
121
122 mapping_set_update(&xas, mapping);
149 123
150 /* hugetlb pages are represented by one entry in the radix tree */ 124 /* hugetlb pages are represented by a single entry in the xarray */
151 nr = PageHuge(page) ? 1 : hpage_nr_pages(page); 125 if (!PageHuge(page)) {
126 xas_set_order(&xas, page->index, compound_order(page));
127 nr = 1U << compound_order(page);
128 }
152 129
153 VM_BUG_ON_PAGE(!PageLocked(page), page); 130 VM_BUG_ON_PAGE(!PageLocked(page), page);
154 VM_BUG_ON_PAGE(PageTail(page), page); 131 VM_BUG_ON_PAGE(PageTail(page), page);
155 VM_BUG_ON_PAGE(nr != 1 && shadow, page); 132 VM_BUG_ON_PAGE(nr != 1 && shadow, page);
156 133
157 for (i = 0; i < nr; i++) { 134 xas_store(&xas, shadow);
158 struct radix_tree_node *node; 135 xas_init_marks(&xas);
159 void **slot;
160
161 __radix_tree_lookup(&mapping->i_pages, page->index + i,
162 &node, &slot);
163
164 VM_BUG_ON_PAGE(!node && nr != 1, page);
165
166 radix_tree_clear_tags(&mapping->i_pages, node, slot);
167 __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
168 workingset_lookup_update(mapping));
169 }
170 136
171 page->mapping = NULL; 137 page->mapping = NULL;
172 /* Leave page->index set: truncation lookup relies upon it */ 138 /* Leave page->index set: truncation lookup relies upon it */
@@ -265,7 +231,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
265 trace_mm_filemap_delete_from_page_cache(page); 231 trace_mm_filemap_delete_from_page_cache(page);
266 232
267 unaccount_page_cache_page(mapping, page); 233 unaccount_page_cache_page(mapping, page);
268 page_cache_tree_delete(mapping, page, shadow); 234 page_cache_delete(mapping, page, shadow);
269} 235}
270 236
271static void page_cache_free_page(struct address_space *mapping, 237static void page_cache_free_page(struct address_space *mapping,
@@ -308,7 +274,7 @@ void delete_from_page_cache(struct page *page)
308EXPORT_SYMBOL(delete_from_page_cache); 274EXPORT_SYMBOL(delete_from_page_cache);
309 275
310/* 276/*
311 * page_cache_tree_delete_batch - delete several pages from page cache 277 * page_cache_delete_batch - delete several pages from page cache
312 * @mapping: the mapping to which pages belong 278 * @mapping: the mapping to which pages belong
313 * @pvec: pagevec with pages to delete 279 * @pvec: pagevec with pages to delete
314 * 280 *
@@ -321,24 +287,19 @@ EXPORT_SYMBOL(delete_from_page_cache);
321 * 287 *
322 * The function expects the i_pages lock to be held. 288 * The function expects the i_pages lock to be held.
323 */ 289 */
324static void 290static void page_cache_delete_batch(struct address_space *mapping,
325page_cache_tree_delete_batch(struct address_space *mapping,
326 struct pagevec *pvec) 291 struct pagevec *pvec)
327{ 292{
328 struct radix_tree_iter iter; 293 XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
329 void **slot;
330 int total_pages = 0; 294 int total_pages = 0;
331 int i = 0, tail_pages = 0; 295 int i = 0, tail_pages = 0;
332 struct page *page; 296 struct page *page;
333 pgoff_t start;
334 297
335 start = pvec->pages[0]->index; 298 mapping_set_update(&xas, mapping);
336 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 299 xas_for_each(&xas, page, ULONG_MAX) {
337 if (i >= pagevec_count(pvec) && !tail_pages) 300 if (i >= pagevec_count(pvec) && !tail_pages)
338 break; 301 break;
339 page = radix_tree_deref_slot_protected(slot, 302 if (xa_is_value(page))
340 &mapping->i_pages.xa_lock);
341 if (radix_tree_exceptional_entry(page))
342 continue; 303 continue;
343 if (!tail_pages) { 304 if (!tail_pages) {
344 /* 305 /*
@@ -346,8 +307,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
346 * have our pages locked so they are protected from 307 * have our pages locked so they are protected from
347 * being removed. 308 * being removed.
348 */ 309 */
349 if (page != pvec->pages[i]) 310 if (page != pvec->pages[i]) {
311 VM_BUG_ON_PAGE(page->index >
312 pvec->pages[i]->index, page);
350 continue; 313 continue;
314 }
351 WARN_ON_ONCE(!PageLocked(page)); 315 WARN_ON_ONCE(!PageLocked(page));
352 if (PageTransHuge(page) && !PageHuge(page)) 316 if (PageTransHuge(page) && !PageHuge(page))
353 tail_pages = HPAGE_PMD_NR - 1; 317 tail_pages = HPAGE_PMD_NR - 1;
@@ -358,11 +322,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
358 */ 322 */
359 i++; 323 i++;
360 } else { 324 } else {
325 VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
326 != pvec->pages[i]->index, page);
361 tail_pages--; 327 tail_pages--;
362 } 328 }
363 radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); 329 xas_store(&xas, NULL);
364 __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
365 workingset_lookup_update(mapping));
366 total_pages++; 330 total_pages++;
367 } 331 }
368 mapping->nrpages -= total_pages; 332 mapping->nrpages -= total_pages;
@@ -383,7 +347,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
383 347
384 unaccount_page_cache_page(mapping, pvec->pages[i]); 348 unaccount_page_cache_page(mapping, pvec->pages[i]);
385 } 349 }
386 page_cache_tree_delete_batch(mapping, pvec); 350 page_cache_delete_batch(mapping, pvec);
387 xa_unlock_irqrestore(&mapping->i_pages, flags); 351 xa_unlock_irqrestore(&mapping->i_pages, flags);
388 352
389 for (i = 0; i < pagevec_count(pvec); i++) 353 for (i = 0; i < pagevec_count(pvec); i++)
@@ -493,20 +457,31 @@ EXPORT_SYMBOL(filemap_flush);
493bool filemap_range_has_page(struct address_space *mapping, 457bool filemap_range_has_page(struct address_space *mapping,
494 loff_t start_byte, loff_t end_byte) 458 loff_t start_byte, loff_t end_byte)
495{ 459{
496 pgoff_t index = start_byte >> PAGE_SHIFT;
497 pgoff_t end = end_byte >> PAGE_SHIFT;
498 struct page *page; 460 struct page *page;
461 XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
462 pgoff_t max = end_byte >> PAGE_SHIFT;
499 463
500 if (end_byte < start_byte) 464 if (end_byte < start_byte)
501 return false; 465 return false;
502 466
503 if (mapping->nrpages == 0) 467 rcu_read_lock();
504 return false; 468 for (;;) {
469 page = xas_find(&xas, max);
470 if (xas_retry(&xas, page))
471 continue;
472 /* Shadow entries don't count */
473 if (xa_is_value(page))
474 continue;
475 /*
476 * We don't need to try to pin this page; we're about to
477 * release the RCU lock anyway. It is enough to know that
478 * there was a page here recently.
479 */
480 break;
481 }
482 rcu_read_unlock();
505 483
506 if (!find_get_pages_range(mapping, &index, end, 1, &page)) 484 return page != NULL;
507 return false;
508 put_page(page);
509 return true;
510} 485}
511EXPORT_SYMBOL(filemap_range_has_page); 486EXPORT_SYMBOL(filemap_range_has_page);
512 487
@@ -777,51 +752,44 @@ EXPORT_SYMBOL(file_write_and_wait_range);
777 * locked. This function does not add the new page to the LRU, the 752 * locked. This function does not add the new page to the LRU, the
778 * caller must do that. 753 * caller must do that.
779 * 754 *
780 * The remove + add is atomic. The only way this function can fail is 755 * The remove + add is atomic. This function cannot fail.
781 * memory allocation failure.
782 */ 756 */
783int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 757int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
784{ 758{
785 int error; 759 struct address_space *mapping = old->mapping;
760 void (*freepage)(struct page *) = mapping->a_ops->freepage;
761 pgoff_t offset = old->index;
762 XA_STATE(xas, &mapping->i_pages, offset);
763 unsigned long flags;
786 764
787 VM_BUG_ON_PAGE(!PageLocked(old), old); 765 VM_BUG_ON_PAGE(!PageLocked(old), old);
788 VM_BUG_ON_PAGE(!PageLocked(new), new); 766 VM_BUG_ON_PAGE(!PageLocked(new), new);
789 VM_BUG_ON_PAGE(new->mapping, new); 767 VM_BUG_ON_PAGE(new->mapping, new);
790 768
791 error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); 769 get_page(new);
792 if (!error) { 770 new->mapping = mapping;
793 struct address_space *mapping = old->mapping; 771 new->index = offset;
794 void (*freepage)(struct page *);
795 unsigned long flags;
796
797 pgoff_t offset = old->index;
798 freepage = mapping->a_ops->freepage;
799
800 get_page(new);
801 new->mapping = mapping;
802 new->index = offset;
803 772
804 xa_lock_irqsave(&mapping->i_pages, flags); 773 xas_lock_irqsave(&xas, flags);
805 __delete_from_page_cache(old, NULL); 774 xas_store(&xas, new);
806 error = page_cache_tree_insert(mapping, new, NULL);
807 BUG_ON(error);
808 775
809 /* 776 old->mapping = NULL;
810 * hugetlb pages do not participate in page cache accounting. 777 /* hugetlb pages do not participate in page cache accounting. */
811 */ 778 if (!PageHuge(old))
812 if (!PageHuge(new)) 779 __dec_node_page_state(new, NR_FILE_PAGES);
813 __inc_node_page_state(new, NR_FILE_PAGES); 780 if (!PageHuge(new))
814 if (PageSwapBacked(new)) 781 __inc_node_page_state(new, NR_FILE_PAGES);
815 __inc_node_page_state(new, NR_SHMEM); 782 if (PageSwapBacked(old))
816 xa_unlock_irqrestore(&mapping->i_pages, flags); 783 __dec_node_page_state(new, NR_SHMEM);
817 mem_cgroup_migrate(old, new); 784 if (PageSwapBacked(new))
818 radix_tree_preload_end(); 785 __inc_node_page_state(new, NR_SHMEM);
819 if (freepage) 786 xas_unlock_irqrestore(&xas, flags);
820 freepage(old); 787 mem_cgroup_migrate(old, new);
821 put_page(old); 788 if (freepage)
822 } 789 freepage(old);
790 put_page(old);
823 791
824 return error; 792 return 0;
825} 793}
826EXPORT_SYMBOL_GPL(replace_page_cache_page); 794EXPORT_SYMBOL_GPL(replace_page_cache_page);
827 795
@@ -830,12 +798,15 @@ static int __add_to_page_cache_locked(struct page *page,
830 pgoff_t offset, gfp_t gfp_mask, 798 pgoff_t offset, gfp_t gfp_mask,
831 void **shadowp) 799 void **shadowp)
832{ 800{
801 XA_STATE(xas, &mapping->i_pages, offset);
833 int huge = PageHuge(page); 802 int huge = PageHuge(page);
834 struct mem_cgroup *memcg; 803 struct mem_cgroup *memcg;
835 int error; 804 int error;
805 void *old;
836 806
837 VM_BUG_ON_PAGE(!PageLocked(page), page); 807 VM_BUG_ON_PAGE(!PageLocked(page), page);
838 VM_BUG_ON_PAGE(PageSwapBacked(page), page); 808 VM_BUG_ON_PAGE(PageSwapBacked(page), page);
809 mapping_set_update(&xas, mapping);
839 810
840 if (!huge) { 811 if (!huge) {
841 error = mem_cgroup_try_charge(page, current->mm, 812 error = mem_cgroup_try_charge(page, current->mm,
@@ -844,39 +815,47 @@ static int __add_to_page_cache_locked(struct page *page,
844 return error; 815 return error;
845 } 816 }
846 817
847 error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
848 if (error) {
849 if (!huge)
850 mem_cgroup_cancel_charge(page, memcg, false);
851 return error;
852 }
853
854 get_page(page); 818 get_page(page);
855 page->mapping = mapping; 819 page->mapping = mapping;
856 page->index = offset; 820 page->index = offset;
857 821
858 xa_lock_irq(&mapping->i_pages); 822 do {
859 error = page_cache_tree_insert(mapping, page, shadowp); 823 xas_lock_irq(&xas);
860 radix_tree_preload_end(); 824 old = xas_load(&xas);
861 if (unlikely(error)) 825 if (old && !xa_is_value(old))
862 goto err_insert; 826 xas_set_err(&xas, -EEXIST);
827 xas_store(&xas, page);
828 if (xas_error(&xas))
829 goto unlock;
830
831 if (xa_is_value(old)) {
832 mapping->nrexceptional--;
833 if (shadowp)
834 *shadowp = old;
835 }
836 mapping->nrpages++;
837
838 /* hugetlb pages do not participate in page cache accounting */
839 if (!huge)
840 __inc_node_page_state(page, NR_FILE_PAGES);
841unlock:
842 xas_unlock_irq(&xas);
843 } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
844
845 if (xas_error(&xas))
846 goto error;
863 847
864 /* hugetlb pages do not participate in page cache accounting. */
865 if (!huge)
866 __inc_node_page_state(page, NR_FILE_PAGES);
867 xa_unlock_irq(&mapping->i_pages);
868 if (!huge) 848 if (!huge)
869 mem_cgroup_commit_charge(page, memcg, false, false); 849 mem_cgroup_commit_charge(page, memcg, false, false);
870 trace_mm_filemap_add_to_page_cache(page); 850 trace_mm_filemap_add_to_page_cache(page);
871 return 0; 851 return 0;
872err_insert: 852error:
873 page->mapping = NULL; 853 page->mapping = NULL;
874 /* Leave page->index set: truncation relies upon it */ 854 /* Leave page->index set: truncation relies upon it */
875 xa_unlock_irq(&mapping->i_pages);
876 if (!huge) 855 if (!huge)
877 mem_cgroup_cancel_charge(page, memcg, false); 856 mem_cgroup_cancel_charge(page, memcg, false);
878 put_page(page); 857 put_page(page);
879 return error; 858 return xas_error(&xas);
880} 859}
881 860
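The rewritten __add_to_page_cache_locked() is built around the standard XArray allocation-retry loop: attempt the store under the lock, and if the array needs a node, drop the lock, let xas_nomem() allocate one, and go around again. A hedged sketch of that loop in isolation (insert_unless_present() is illustrative, and unlike the page-cache code it treats value entries as occupied slots):

#include <linux/xarray.h>

/*
 * Insert @entry at @index unless something is already stored there.
 * Returns 0 on success, -EEXIST if the slot was occupied, or -ENOMEM.
 */
static int insert_unless_present(struct xarray *xa, unsigned long index,
                                 void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);

        do {
                xas_lock_irq(&xas);
                if (xas_load(&xas))
                        xas_set_err(&xas, -EEXIST);
                else
                        xas_store(&xas, entry);
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));  /* allocate outside the lock, retry */

        return xas_error(&xas);
}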
882/** 861/**
@@ -1341,86 +1320,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
1341} 1320}
1342 1321
1343/** 1322/**
1344 * page_cache_next_hole - find the next hole (not-present entry) 1323 * page_cache_next_miss() - Find the next gap in the page cache.
1345 * @mapping: mapping 1324 * @mapping: Mapping.
1346 * @index: index 1325 * @index: Index.
1347 * @max_scan: maximum range to search 1326 * @max_scan: Maximum range to search.
1348 * 1327 *
1349 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the 1328 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
1350 * lowest indexed hole. 1329 * gap with the lowest index.
1351 * 1330 *
1352 * Returns: the index of the hole if found, otherwise returns an index 1331 * This function may be called under the rcu_read_lock. However, this will
1353 * outside of the set specified (in which case 'return - index >= 1332 * not atomically search a snapshot of the cache at a single point in time.
1354 * max_scan' will be true). In rare cases of index wrap-around, 0 will 1333 * For example, if a gap is created at index 5, then subsequently a gap is
1355 * be returned. 1334 * created at index 10, page_cache_next_miss covering both indices may
1356 * 1335 * return 10 if called under the rcu_read_lock.
1357 * page_cache_next_hole may be called under rcu_read_lock. However, 1336 *
1358 * like radix_tree_gang_lookup, this will not atomically search a 1337 * Return: The index of the gap if found, otherwise an index outside the
1359 * snapshot of the tree at a single point in time. For example, if a 1338 * range specified (in which case 'return - index >= max_scan' will be true).
1360 * hole is created at index 5, then subsequently a hole is created at 1339 * In the rare case of index wrap-around, 0 will be returned.
1361 * index 10, page_cache_next_hole covering both indexes may return 10
1362 * if called under rcu_read_lock.
1363 */ 1340 */
1364pgoff_t page_cache_next_hole(struct address_space *mapping, 1341pgoff_t page_cache_next_miss(struct address_space *mapping,
1365 pgoff_t index, unsigned long max_scan) 1342 pgoff_t index, unsigned long max_scan)
1366{ 1343{
1367 unsigned long i; 1344 XA_STATE(xas, &mapping->i_pages, index);
1368
1369 for (i = 0; i < max_scan; i++) {
1370 struct page *page;
1371 1345
1372 page = radix_tree_lookup(&mapping->i_pages, index); 1346 while (max_scan--) {
1373 if (!page || radix_tree_exceptional_entry(page)) 1347 void *entry = xas_next(&xas);
1348 if (!entry || xa_is_value(entry))
1374 break; 1349 break;
1375 index++; 1350 if (xas.xa_index == 0)
1376 if (index == 0)
1377 break; 1351 break;
1378 } 1352 }
1379 1353
1380 return index; 1354 return xas.xa_index;
1381} 1355}
1382EXPORT_SYMBOL(page_cache_next_hole); 1356EXPORT_SYMBOL(page_cache_next_miss);
1383 1357
1384/** 1358/**
1385 * page_cache_prev_hole - find the prev hole (not-present entry) 1359 * page_cache_prev_miss() - Find the next gap in the page cache.
1386 * @mapping: mapping 1360 * @mapping: Mapping.
1387 * @index: index 1361 * @index: Index.
1388 * @max_scan: maximum range to search 1362 * @max_scan: Maximum range to search.
1389 * 1363 *
1390 * Search backwards in the range [max(index-max_scan+1, 0), index] for 1364 * Search the range [max(index - max_scan + 1, 0), index] for the
1391 * the first hole. 1365 * gap with the highest index.
1392 * 1366 *
1393 * Returns: the index of the hole if found, otherwise returns an index 1367 * This function may be called under the rcu_read_lock. However, this will
1394 * outside of the set specified (in which case 'index - return >= 1368 * not atomically search a snapshot of the cache at a single point in time.
1395 * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX 1369 * For example, if a gap is created at index 10, then subsequently a gap is
1396 * will be returned. 1370 * created at index 5, page_cache_prev_miss() covering both indices may
1397 * 1371 * return 5 if called under the rcu_read_lock.
1398 * page_cache_prev_hole may be called under rcu_read_lock. However, 1372 *
1399 * like radix_tree_gang_lookup, this will not atomically search a 1373 * Return: The index of the gap if found, otherwise an index outside the
1400 * snapshot of the tree at a single point in time. For example, if a 1374 * range specified (in which case 'index - return >= max_scan' will be true).
1401 * hole is created at index 10, then subsequently a hole is created at 1375 * In the rare case of wrap-around, ULONG_MAX will be returned.
1402 * index 5, page_cache_prev_hole covering both indexes may return 5 if
1403 * called under rcu_read_lock.
1404 */ 1376 */
1405pgoff_t page_cache_prev_hole(struct address_space *mapping, 1377pgoff_t page_cache_prev_miss(struct address_space *mapping,
1406 pgoff_t index, unsigned long max_scan) 1378 pgoff_t index, unsigned long max_scan)
1407{ 1379{
1408 unsigned long i; 1380 XA_STATE(xas, &mapping->i_pages, index);
1409
1410 for (i = 0; i < max_scan; i++) {
1411 struct page *page;
1412 1381
1413 page = radix_tree_lookup(&mapping->i_pages, index); 1382 while (max_scan--) {
1414 if (!page || radix_tree_exceptional_entry(page)) 1383 void *entry = xas_prev(&xas);
1384 if (!entry || xa_is_value(entry))
1415 break; 1385 break;
1416 index--; 1386 if (xas.xa_index == ULONG_MAX)
1417 if (index == ULONG_MAX)
1418 break; 1387 break;
1419 } 1388 }
1420 1389
1421 return index; 1390 return xas.xa_index;
1422} 1391}
1423EXPORT_SYMBOL(page_cache_prev_hole); 1392EXPORT_SYMBOL(page_cache_prev_miss);
1424 1393
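page_cache_next_miss() and page_cache_prev_miss() are now thin wrappers around xas_next()/xas_prev(): step through adjacent indices until the entry is absent or is a value entry, then report xas.xa_index. A sketch of the forward search on a bare XArray (next_gap() is illustrative; note the first xas_next() call after XA_STATE() loads the starting index rather than advancing):

#include <linux/xarray.h>

/*
 * Return the lowest index >= @index with no pointer entry, scanning at
 * most @max_scan indices.  Mirrors the structure of page_cache_next_miss().
 */
static unsigned long next_gap(struct xarray *xa, unsigned long index,
                              unsigned long max_scan)
{
        XA_STATE(xas, xa, index);

        rcu_read_lock();
        while (max_scan--) {
                /* First call loads @index itself; later calls step forward. */
                void *entry = xas_next(&xas);

                if (!entry || xa_is_value(entry))
                        break;                  /* gap (or value entry) found */
                if (xas.xa_index == 0)
                        break;                  /* wrapped past ULONG_MAX */
        }
        rcu_read_unlock();

        return xas.xa_index;
}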
1425/** 1394/**
1426 * find_get_entry - find and get a page cache entry 1395 * find_get_entry - find and get a page cache entry
@@ -1437,47 +1406,40 @@ EXPORT_SYMBOL(page_cache_prev_hole);
1437 */ 1406 */
1438struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1407struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
1439{ 1408{
1440 void **pagep; 1409 XA_STATE(xas, &mapping->i_pages, offset);
1441 struct page *head, *page; 1410 struct page *head, *page;
1442 1411
1443 rcu_read_lock(); 1412 rcu_read_lock();
1444repeat: 1413repeat:
1445 page = NULL; 1414 xas_reset(&xas);
1446 pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); 1415 page = xas_load(&xas);
1447 if (pagep) { 1416 if (xas_retry(&xas, page))
1448 page = radix_tree_deref_slot(pagep); 1417 goto repeat;
1449 if (unlikely(!page)) 1418 /*
1450 goto out; 1419 * A shadow entry of a recently evicted page, or a swap entry from
1451 if (radix_tree_exception(page)) { 1420 * shmem/tmpfs. Return it without attempting to raise page count.
1452 if (radix_tree_deref_retry(page)) 1421 */
1453 goto repeat; 1422 if (!page || xa_is_value(page))
1454 /* 1423 goto out;
1455 * A shadow entry of a recently evicted page,
1456 * or a swap entry from shmem/tmpfs. Return
1457 * it without attempting to raise page count.
1458 */
1459 goto out;
1460 }
1461 1424
1462 head = compound_head(page); 1425 head = compound_head(page);
1463 if (!page_cache_get_speculative(head)) 1426 if (!page_cache_get_speculative(head))
1464 goto repeat; 1427 goto repeat;
1465 1428
1466 /* The page was split under us? */ 1429 /* The page was split under us? */
1467 if (compound_head(page) != head) { 1430 if (compound_head(page) != head) {
1468 put_page(head); 1431 put_page(head);
1469 goto repeat; 1432 goto repeat;
1470 } 1433 }
1471 1434
1472 /* 1435 /*
1473 * Has the page moved? 1436 * Has the page moved?
1474 * This is part of the lockless pagecache protocol. See 1437 * This is part of the lockless pagecache protocol. See
1475 * include/linux/pagemap.h for details. 1438 * include/linux/pagemap.h for details.
1476 */ 1439 */
1477 if (unlikely(page != *pagep)) { 1440 if (unlikely(page != xas_reload(&xas))) {
1478 put_page(head); 1441 put_page(head);
1479 goto repeat; 1442 goto repeat;
1480 }
1481 } 1443 }
1482out: 1444out:
1483 rcu_read_unlock(); 1445 rcu_read_unlock();
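find_get_entry() keeps the lockless page-cache protocol, now spelled with xas_load(), xas_retry() and xas_reload(): load under RCU, take a speculative reference, then reload the slot to confirm nothing moved. A stripped-down sketch of that protocol over a generic object; obj_tryget() and obj_put() are hypothetical stand-ins for page_cache_get_speculative() and put_page():

#include <linux/xarray.h>

struct obj;
/* Hypothetical reference helpers, standing in for
 * page_cache_get_speculative() and put_page(). */
bool obj_tryget(struct obj *obj);
void obj_put(struct obj *obj);

/*
 * Lockless lookup: load under RCU, take a reference, then reload the
 * slot to confirm it still points at the same object.
 */
static struct obj *lookup_get(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        struct obj *obj;

        rcu_read_lock();
repeat:
        xas_reset(&xas);
        obj = xas_load(&xas);
        if (xas_retry(&xas, obj))               /* internal retry entry */
                goto repeat;
        if (!obj || xa_is_value(obj)) {         /* absent, or a value entry */
                obj = NULL;
                goto out;
        }
        if (!obj_tryget(obj))                   /* object may be mid-free */
                goto repeat;
        if (obj != xas_reload(&xas)) {          /* slot changed under us */
                obj_put(obj);
                goto repeat;
        }
out:
        rcu_read_unlock();
        return obj;
}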
@@ -1508,7 +1470,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
1508 1470
1509repeat: 1471repeat:
1510 page = find_get_entry(mapping, offset); 1472 page = find_get_entry(mapping, offset);
1511 if (page && !radix_tree_exception(page)) { 1473 if (page && !xa_is_value(page)) {
1512 lock_page(page); 1474 lock_page(page);
1513 /* Has the page been truncated? */ 1475 /* Has the page been truncated? */
1514 if (unlikely(page_mapping(page) != mapping)) { 1476 if (unlikely(page_mapping(page) != mapping)) {
@@ -1554,7 +1516,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
1554 1516
1555repeat: 1517repeat:
1556 page = find_get_entry(mapping, offset); 1518 page = find_get_entry(mapping, offset);
1557 if (radix_tree_exceptional_entry(page)) 1519 if (xa_is_value(page))
1558 page = NULL; 1520 page = NULL;
1559 if (!page) 1521 if (!page)
1560 goto no_page; 1522 goto no_page;
@@ -1640,53 +1602,48 @@ unsigned find_get_entries(struct address_space *mapping,
1640 pgoff_t start, unsigned int nr_entries, 1602 pgoff_t start, unsigned int nr_entries,
1641 struct page **entries, pgoff_t *indices) 1603 struct page **entries, pgoff_t *indices)
1642{ 1604{
1643 void **slot; 1605 XA_STATE(xas, &mapping->i_pages, start);
1606 struct page *page;
1644 unsigned int ret = 0; 1607 unsigned int ret = 0;
1645 struct radix_tree_iter iter;
1646 1608
1647 if (!nr_entries) 1609 if (!nr_entries)
1648 return 0; 1610 return 0;
1649 1611
1650 rcu_read_lock(); 1612 rcu_read_lock();
1651 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 1613 xas_for_each(&xas, page, ULONG_MAX) {
1652 struct page *head, *page; 1614 struct page *head;
1653repeat: 1615 if (xas_retry(&xas, page))
1654 page = radix_tree_deref_slot(slot);
1655 if (unlikely(!page))
1656 continue; 1616 continue;
1657 if (radix_tree_exception(page)) { 1617 /*
1658 if (radix_tree_deref_retry(page)) { 1618 * A shadow entry of a recently evicted page, a swap
1659 slot = radix_tree_iter_retry(&iter); 1619 * entry from shmem/tmpfs or a DAX entry. Return it
1660 continue; 1620 * without attempting to raise page count.
1661 } 1621 */
1662 /* 1622 if (xa_is_value(page))
1663 * A shadow entry of a recently evicted page, a swap
1664 * entry from shmem/tmpfs or a DAX entry. Return it
1665 * without attempting to raise page count.
1666 */
1667 goto export; 1623 goto export;
1668 }
1669 1624
1670 head = compound_head(page); 1625 head = compound_head(page);
1671 if (!page_cache_get_speculative(head)) 1626 if (!page_cache_get_speculative(head))
1672 goto repeat; 1627 goto retry;
1673 1628
1674 /* The page was split under us? */ 1629 /* The page was split under us? */
1675 if (compound_head(page) != head) { 1630 if (compound_head(page) != head)
1676 put_page(head); 1631 goto put_page;
1677 goto repeat;
1678 }
1679 1632
1680 /* Has the page moved? */ 1633 /* Has the page moved? */
1681 if (unlikely(page != *slot)) { 1634 if (unlikely(page != xas_reload(&xas)))
1682 put_page(head); 1635 goto put_page;
1683 goto repeat; 1636
1684 }
1685export: 1637export:
1686 indices[ret] = iter.index; 1638 indices[ret] = xas.xa_index;
1687 entries[ret] = page; 1639 entries[ret] = page;
1688 if (++ret == nr_entries) 1640 if (++ret == nr_entries)
1689 break; 1641 break;
1642 continue;
1643put_page:
1644 put_page(head);
1645retry:
1646 xas_reset(&xas);
1690 } 1647 }
1691 rcu_read_unlock(); 1648 rcu_read_unlock();
1692 return ret; 1649 return ret;
@@ -1717,64 +1674,50 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
1717 pgoff_t end, unsigned int nr_pages, 1674 pgoff_t end, unsigned int nr_pages,
1718 struct page **pages) 1675 struct page **pages)
1719{ 1676{
1720 struct radix_tree_iter iter; 1677 XA_STATE(xas, &mapping->i_pages, *start);
1721 void **slot; 1678 struct page *page;
1722 unsigned ret = 0; 1679 unsigned ret = 0;
1723 1680
1724 if (unlikely(!nr_pages)) 1681 if (unlikely(!nr_pages))
1725 return 0; 1682 return 0;
1726 1683
1727 rcu_read_lock(); 1684 rcu_read_lock();
1728 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { 1685 xas_for_each(&xas, page, end) {
1729 struct page *head, *page; 1686 struct page *head;
1730 1687 if (xas_retry(&xas, page))
1731 if (iter.index > end)
1732 break;
1733repeat:
1734 page = radix_tree_deref_slot(slot);
1735 if (unlikely(!page))
1736 continue; 1688 continue;
1737 1689 /* Skip over shadow, swap and DAX entries */
1738 if (radix_tree_exception(page)) { 1690 if (xa_is_value(page))
1739 if (radix_tree_deref_retry(page)) {
1740 slot = radix_tree_iter_retry(&iter);
1741 continue;
1742 }
1743 /*
1744 * A shadow entry of a recently evicted page,
1745 * or a swap entry from shmem/tmpfs. Skip
1746 * over it.
1747 */
1748 continue; 1691 continue;
1749 }
1750 1692
1751 head = compound_head(page); 1693 head = compound_head(page);
1752 if (!page_cache_get_speculative(head)) 1694 if (!page_cache_get_speculative(head))
1753 goto repeat; 1695 goto retry;
1754 1696
1755 /* The page was split under us? */ 1697 /* The page was split under us? */
1756 if (compound_head(page) != head) { 1698 if (compound_head(page) != head)
1757 put_page(head); 1699 goto put_page;
1758 goto repeat;
1759 }
1760 1700
1761 /* Has the page moved? */ 1701 /* Has the page moved? */
1762 if (unlikely(page != *slot)) { 1702 if (unlikely(page != xas_reload(&xas)))
1763 put_page(head); 1703 goto put_page;
1764 goto repeat;
1765 }
1766 1704
1767 pages[ret] = page; 1705 pages[ret] = page;
1768 if (++ret == nr_pages) { 1706 if (++ret == nr_pages) {
1769 *start = pages[ret - 1]->index + 1; 1707 *start = page->index + 1;
1770 goto out; 1708 goto out;
1771 } 1709 }
1710 continue;
1711put_page:
1712 put_page(head);
1713retry:
1714 xas_reset(&xas);
1772 } 1715 }
1773 1716
1774 /* 1717 /*
1775 * We come here when there is no page beyond @end. We take care to not 1718 * We come here when there is no page beyond @end. We take care to not
1776 * overflow the index @start as it confuses some of the callers. This 1719 * overflow the index @start as it confuses some of the callers. This
1777 * breaks the iteration when there is page at index -1 but that is 1720 * breaks the iteration when there is a page at index -1 but that is
1778 * already broken anyway. 1721 * already broken anyway.
1779 */ 1722 */
1780 if (end == (pgoff_t)-1) 1723 if (end == (pgoff_t)-1)
@@ -1802,57 +1745,43 @@ out:
1802unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 1745unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
1803 unsigned int nr_pages, struct page **pages) 1746 unsigned int nr_pages, struct page **pages)
1804{ 1747{
1805 struct radix_tree_iter iter; 1748 XA_STATE(xas, &mapping->i_pages, index);
1806 void **slot; 1749 struct page *page;
1807 unsigned int ret = 0; 1750 unsigned int ret = 0;
1808 1751
1809 if (unlikely(!nr_pages)) 1752 if (unlikely(!nr_pages))
1810 return 0; 1753 return 0;
1811 1754
1812 rcu_read_lock(); 1755 rcu_read_lock();
1813 radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { 1756 for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1814 struct page *head, *page; 1757 struct page *head;
1815repeat: 1758 if (xas_retry(&xas, page))
1816 page = radix_tree_deref_slot(slot); 1759 continue;
1817 /* The hole, there no reason to continue */ 1760 /*
1818 if (unlikely(!page)) 1761 * If the entry has been swapped out, we can stop looking.
1819 break; 1762 * No current caller is looking for DAX entries.
1820 1763 */
1821 if (radix_tree_exception(page)) { 1764 if (xa_is_value(page))
1822 if (radix_tree_deref_retry(page)) {
1823 slot = radix_tree_iter_retry(&iter);
1824 continue;
1825 }
1826 /*
1827 * A shadow entry of a recently evicted page,
1828 * or a swap entry from shmem/tmpfs. Stop
1829 * looking for contiguous pages.
1830 */
1831 break; 1765 break;
1832 }
1833 1766
1834 head = compound_head(page); 1767 head = compound_head(page);
1835 if (!page_cache_get_speculative(head)) 1768 if (!page_cache_get_speculative(head))
1836 goto repeat; 1769 goto retry;
1837 1770
1838 /* The page was split under us? */ 1771 /* The page was split under us? */
1839 if (compound_head(page) != head) { 1772 if (compound_head(page) != head)
1840 put_page(head); 1773 goto put_page;
1841 goto repeat;
1842 }
1843 1774
1844 /* Has the page moved? */ 1775 /* Has the page moved? */
1845 if (unlikely(page != *slot)) { 1776 if (unlikely(page != xas_reload(&xas)))
1846 put_page(head); 1777 goto put_page;
1847 goto repeat;
1848 }
1849 1778
1850 /* 1779 /*
1851 * must check mapping and index after taking the ref. 1780 * must check mapping and index after taking the ref.
1852 * otherwise we can get both false positives and false 1781 * otherwise we can get both false positives and false
1853 * negatives, which is just confusing to the caller. 1782 * negatives, which is just confusing to the caller.
1854 */ 1783 */
1855 if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { 1784 if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
1856 put_page(page); 1785 put_page(page);
1857 break; 1786 break;
1858 } 1787 }
@@ -1860,6 +1789,11 @@ repeat:
1860 pages[ret] = page; 1789 pages[ret] = page;
1861 if (++ret == nr_pages) 1790 if (++ret == nr_pages)
1862 break; 1791 break;
1792 continue;
1793put_page:
1794 put_page(head);
1795retry:
1796 xas_reset(&xas);
1863 } 1797 }
1864 rcu_read_unlock(); 1798 rcu_read_unlock();
1865 return ret; 1799 return ret;
@@ -1879,74 +1813,58 @@ EXPORT_SYMBOL(find_get_pages_contig);
1879 * @tag. We update @index to index the next page for the traversal. 1813 * @tag. We update @index to index the next page for the traversal.
1880 */ 1814 */
1881unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, 1815unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
1882 pgoff_t end, int tag, unsigned int nr_pages, 1816 pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
1883 struct page **pages) 1817 struct page **pages)
1884{ 1818{
1885 struct radix_tree_iter iter; 1819 XA_STATE(xas, &mapping->i_pages, *index);
1886 void **slot; 1820 struct page *page;
1887 unsigned ret = 0; 1821 unsigned ret = 0;
1888 1822
1889 if (unlikely(!nr_pages)) 1823 if (unlikely(!nr_pages))
1890 return 0; 1824 return 0;
1891 1825
1892 rcu_read_lock(); 1826 rcu_read_lock();
1893 radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { 1827 xas_for_each_marked(&xas, page, end, tag) {
1894 struct page *head, *page; 1828 struct page *head;
1895 1829 if (xas_retry(&xas, page))
1896 if (iter.index > end)
1897 break;
1898repeat:
1899 page = radix_tree_deref_slot(slot);
1900 if (unlikely(!page))
1901 continue; 1830 continue;
1902 1831 /*
1903 if (radix_tree_exception(page)) { 1832 * Shadow entries should never be tagged, but this iteration
1904 if (radix_tree_deref_retry(page)) { 1833 * is lockless so there is a window for page reclaim to evict
1905 slot = radix_tree_iter_retry(&iter); 1834 * a page we saw tagged. Skip over it.
1906 continue; 1835 */
1907 } 1836 if (xa_is_value(page))
1908 /*
1909 * A shadow entry of a recently evicted page.
1910 *
1911 * Those entries should never be tagged, but
1912 * this tree walk is lockless and the tags are
1913 * looked up in bulk, one radix tree node at a
1914 * time, so there is a sizable window for page
1915 * reclaim to evict a page we saw tagged.
1916 *
1917 * Skip over it.
1918 */
1919 continue; 1837 continue;
1920 }
1921 1838
1922 head = compound_head(page); 1839 head = compound_head(page);
1923 if (!page_cache_get_speculative(head)) 1840 if (!page_cache_get_speculative(head))
1924 goto repeat; 1841 goto retry;
1925 1842
1926 /* The page was split under us? */ 1843 /* The page was split under us? */
1927 if (compound_head(page) != head) { 1844 if (compound_head(page) != head)
1928 put_page(head); 1845 goto put_page;
1929 goto repeat;
1930 }
1931 1846
1932 /* Has the page moved? */ 1847 /* Has the page moved? */
1933 if (unlikely(page != *slot)) { 1848 if (unlikely(page != xas_reload(&xas)))
1934 put_page(head); 1849 goto put_page;
1935 goto repeat;
1936 }
1937 1850
1938 pages[ret] = page; 1851 pages[ret] = page;
1939 if (++ret == nr_pages) { 1852 if (++ret == nr_pages) {
1940 *index = pages[ret - 1]->index + 1; 1853 *index = page->index + 1;
1941 goto out; 1854 goto out;
1942 } 1855 }
1856 continue;
1857put_page:
1858 put_page(head);
1859retry:
1860 xas_reset(&xas);
1943 } 1861 }
1944 1862
1945 /* 1863 /*
1946 * We come here when we got at @end. We take care to not overflow the 1864 * We come here when we got to @end. We take care to not overflow the
1947 * index @index as it confuses some of the callers. This breaks the 1865 * index @index as it confuses some of the callers. This breaks the
1948 * iteration when there is page at index -1 but that is already broken 1866 * iteration when there is a page at index -1 but that is already
1949 * anyway. 1867 * broken anyway.
1950 */ 1868 */
1951 if (end == (pgoff_t)-1) 1869 if (end == (pgoff_t)-1)
1952 *index = (pgoff_t)-1; 1870 *index = (pgoff_t)-1;
@@ -1972,57 +1890,51 @@ EXPORT_SYMBOL(find_get_pages_range_tag);
1972 * @tag. 1890 * @tag.
1973 */ 1891 */
1974unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1892unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
1975 int tag, unsigned int nr_entries, 1893 xa_mark_t tag, unsigned int nr_entries,
1976 struct page **entries, pgoff_t *indices) 1894 struct page **entries, pgoff_t *indices)
1977{ 1895{
1978 void **slot; 1896 XA_STATE(xas, &mapping->i_pages, start);
1897 struct page *page;
1979 unsigned int ret = 0; 1898 unsigned int ret = 0;
1980 struct radix_tree_iter iter;
1981 1899
1982 if (!nr_entries) 1900 if (!nr_entries)
1983 return 0; 1901 return 0;
1984 1902
1985 rcu_read_lock(); 1903 rcu_read_lock();
1986 radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { 1904 xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
1987 struct page *head, *page; 1905 struct page *head;
1988repeat: 1906 if (xas_retry(&xas, page))
1989 page = radix_tree_deref_slot(slot);
1990 if (unlikely(!page))
1991 continue; 1907 continue;
1992 if (radix_tree_exception(page)) { 1908 /*
1993 if (radix_tree_deref_retry(page)) { 1909 * A shadow entry of a recently evicted page, a swap
1994 slot = radix_tree_iter_retry(&iter); 1910 * entry from shmem/tmpfs or a DAX entry. Return it
1995 continue; 1911 * without attempting to raise page count.
1996 } 1912 */
1997 1913 if (xa_is_value(page))
1998 /*
1999 * A shadow entry of a recently evicted page, a swap
2000 * entry from shmem/tmpfs or a DAX entry. Return it
2001 * without attempting to raise page count.
2002 */
2003 goto export; 1914 goto export;
2004 }
2005 1915
2006 head = compound_head(page); 1916 head = compound_head(page);
2007 if (!page_cache_get_speculative(head)) 1917 if (!page_cache_get_speculative(head))
2008 goto repeat; 1918 goto retry;
2009 1919
2010 /* The page was split under us? */ 1920 /* The page was split under us? */
2011 if (compound_head(page) != head) { 1921 if (compound_head(page) != head)
2012 put_page(head); 1922 goto put_page;
2013 goto repeat;
2014 }
2015 1923
2016 /* Has the page moved? */ 1924 /* Has the page moved? */
2017 if (unlikely(page != *slot)) { 1925 if (unlikely(page != xas_reload(&xas)))
2018 put_page(head); 1926 goto put_page;
2019 goto repeat; 1927
2020 }
2021export: 1928export:
2022 indices[ret] = iter.index; 1929 indices[ret] = xas.xa_index;
2023 entries[ret] = page; 1930 entries[ret] = page;
2024 if (++ret == nr_entries) 1931 if (++ret == nr_entries)
2025 break; 1932 break;
1933 continue;
1934put_page:
1935 put_page(head);
1936retry:
1937 xas_reset(&xas);
2026 } 1938 }
2027 rcu_read_unlock(); 1939 rcu_read_unlock();
2028 return ret; 1940 return ret;
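The tagged lookups above now use xas_for_each_marked(), which visits only entries carrying the requested mark. A sketch of gathering a batch of marked entries under RCU (collect_marked() is illustrative; marks are XA_MARK_0..XA_MARK_2, onto which the page-cache dirty/writeback/towrite tags map):

#include <linux/xarray.h>

/*
 * Collect up to @nr entries carrying @mark, starting at @start.
 */
static unsigned int collect_marked(struct xarray *xa, unsigned long start,
                                   xa_mark_t mark, void **batch,
                                   unsigned int nr)
{
        XA_STATE(xas, xa, start);
        unsigned int ret = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each_marked(&xas, entry, ULONG_MAX, mark) {
                if (xas_retry(&xas, entry))     /* skip internal retry entries */
                        continue;
                batch[ret] = entry;
                if (++ret == nr)
                        break;
        }
        rcu_read_unlock();

        return ret;
}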
@@ -2626,45 +2538,31 @@ EXPORT_SYMBOL(filemap_fault);
2626void filemap_map_pages(struct vm_fault *vmf, 2538void filemap_map_pages(struct vm_fault *vmf,
2627 pgoff_t start_pgoff, pgoff_t end_pgoff) 2539 pgoff_t start_pgoff, pgoff_t end_pgoff)
2628{ 2540{
2629 struct radix_tree_iter iter;
2630 void **slot;
2631 struct file *file = vmf->vma->vm_file; 2541 struct file *file = vmf->vma->vm_file;
2632 struct address_space *mapping = file->f_mapping; 2542 struct address_space *mapping = file->f_mapping;
2633 pgoff_t last_pgoff = start_pgoff; 2543 pgoff_t last_pgoff = start_pgoff;
2634 unsigned long max_idx; 2544 unsigned long max_idx;
2545 XA_STATE(xas, &mapping->i_pages, start_pgoff);
2635 struct page *head, *page; 2546 struct page *head, *page;
2636 2547
2637 rcu_read_lock(); 2548 rcu_read_lock();
2638 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { 2549 xas_for_each(&xas, page, end_pgoff) {
2639 if (iter.index > end_pgoff) 2550 if (xas_retry(&xas, page))
2640 break; 2551 continue;
2641repeat: 2552 if (xa_is_value(page))
2642 page = radix_tree_deref_slot(slot);
2643 if (unlikely(!page))
2644 goto next;
2645 if (radix_tree_exception(page)) {
2646 if (radix_tree_deref_retry(page)) {
2647 slot = radix_tree_iter_retry(&iter);
2648 continue;
2649 }
2650 goto next; 2553 goto next;
2651 }
2652 2554
2653 head = compound_head(page); 2555 head = compound_head(page);
2654 if (!page_cache_get_speculative(head)) 2556 if (!page_cache_get_speculative(head))
2655 goto repeat; 2557 goto next;
2656 2558
2657 /* The page was split under us? */ 2559 /* The page was split under us? */
2658 if (compound_head(page) != head) { 2560 if (compound_head(page) != head)
2659 put_page(head); 2561 goto skip;
2660 goto repeat;
2661 }
2662 2562
2663 /* Has the page moved? */ 2563 /* Has the page moved? */
2664 if (unlikely(page != *slot)) { 2564 if (unlikely(page != xas_reload(&xas)))
2665 put_page(head); 2565 goto skip;
2666 goto repeat;
2667 }
2668 2566
2669 if (!PageUptodate(page) || 2567 if (!PageUptodate(page) ||
2670 PageReadahead(page) || 2568 PageReadahead(page) ||
@@ -2683,10 +2581,10 @@ repeat:
2683 if (file->f_ra.mmap_miss > 0) 2581 if (file->f_ra.mmap_miss > 0)
2684 file->f_ra.mmap_miss--; 2582 file->f_ra.mmap_miss--;
2685 2583
2686 vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; 2584 vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
2687 if (vmf->pte) 2585 if (vmf->pte)
2688 vmf->pte += iter.index - last_pgoff; 2586 vmf->pte += xas.xa_index - last_pgoff;
2689 last_pgoff = iter.index; 2587 last_pgoff = xas.xa_index;
2690 if (alloc_set_pte(vmf, NULL, page)) 2588 if (alloc_set_pte(vmf, NULL, page))
2691 goto unlock; 2589 goto unlock;
2692 unlock_page(page); 2590 unlock_page(page);
@@ -2699,8 +2597,6 @@ next:
2699 /* Huge page is mapped? No need to proceed. */ 2597 /* Huge page is mapped? No need to proceed. */
2700 if (pmd_trans_huge(*vmf->pmd)) 2598 if (pmd_trans_huge(*vmf->pmd))
2701 break; 2599 break;
2702 if (iter.index == end_pgoff)
2703 break;
2704 } 2600 }
2705 rcu_read_unlock(); 2601 rcu_read_unlock();
2706} 2602}
@@ -2810,7 +2706,7 @@ repeat:
2810 put_page(page); 2706 put_page(page);
2811 if (err == -EEXIST) 2707 if (err == -EEXIST)
2812 goto repeat; 2708 goto repeat;
2813 /* Presumably ENOMEM for radix tree node */ 2709 /* Presumably ENOMEM for xarray node */
2814 return ERR_PTR(err); 2710 return ERR_PTR(err);
2815 } 2711 }
2816 2712
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 25ef59b7ee34..4e4ef8fa479d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2450,13 +2450,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
2450 ClearPageCompound(head); 2450 ClearPageCompound(head);
2451 /* See comment in __split_huge_page_tail() */ 2451 /* See comment in __split_huge_page_tail() */
2452 if (PageAnon(head)) { 2452 if (PageAnon(head)) {
2453 /* Additional pin to radix tree of swap cache */ 2453 /* Additional pin to swap cache */
2454 if (PageSwapCache(head)) 2454 if (PageSwapCache(head))
2455 page_ref_add(head, 2); 2455 page_ref_add(head, 2);
2456 else 2456 else
2457 page_ref_inc(head); 2457 page_ref_inc(head);
2458 } else { 2458 } else {
2459 /* Additional pin to radix tree */ 2459 /* Additional pin to page cache */
2460 page_ref_add(head, 2); 2460 page_ref_add(head, 2);
2461 xa_unlock(&head->mapping->i_pages); 2461 xa_unlock(&head->mapping->i_pages);
2462 } 2462 }
@@ -2568,7 +2568,7 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
2568{ 2568{
2569 int extra_pins; 2569 int extra_pins;
2570 2570
2571 /* Additional pins from radix tree */ 2571 /* Additional pins from page cache */
2572 if (PageAnon(page)) 2572 if (PageAnon(page))
2573 extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; 2573 extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
2574 else 2574 else
@@ -2664,17 +2664,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2664 spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); 2664 spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
2665 2665
2666 if (mapping) { 2666 if (mapping) {
2667 void **pslot; 2667 XA_STATE(xas, &mapping->i_pages, page_index(head));
2668 2668
2669 xa_lock(&mapping->i_pages);
2670 pslot = radix_tree_lookup_slot(&mapping->i_pages,
2671 page_index(head));
2672 /* 2669 /*
2673 * Check if the head page is present in radix tree. 2670 * Check if the head page is present in page cache.
2674 * We assume all tail are present too, if head is there. 2671 * We assume all tail are present too, if head is there.
2675 */ 2672 */
2676 if (radix_tree_deref_slot_protected(pslot, 2673 xa_lock(&mapping->i_pages);
2677 &mapping->i_pages.xa_lock) != head) 2674 if (xas_load(&xas) != head)
2678 goto fail; 2675 goto fail;
2679 } 2676 }
2680 2677
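In split_huge_page_to_list() the slot-pointer dance is reduced to comparing xas_load() with the expected head page while the xa_lock is held. When that kind of stability is not required, the plain xa_load() API is shorter still, since it takes the RCU read lock itself; a sketch (still_present() is illustrative):

#include <linux/xarray.h>

/*
 * One-off check that @index still maps to @expected.  xa_load() handles
 * RCU internally; the hunk above additionally holds the xa_lock because
 * the answer must stay stable while the page is split.
 */
static bool still_present(struct xarray *xa, unsigned long index,
                          void *expected)
{
        return xa_load(xa, index) == expected;
}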
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a31d740e6cd1..c13625c1ad5e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1288,17 +1288,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1288 * 1288 *
1289 * Basic scheme is simple, details are more complex: 1289 * Basic scheme is simple, details are more complex:
1290 * - allocate and freeze a new huge page; 1290 * - allocate and freeze a new huge page;
1291 * - scan over radix tree replacing old pages the new one 1291 * - scan page cache replacing old pages with the new one
1292 * + swap in pages if necessary; 1292 * + swap in pages if necessary;
1293 * + fill in gaps; 1293 * + fill in gaps;
1294 * + keep old pages around in case if rollback is required; 1294 * + keep old pages around in case rollback is required;
1295 * - if replacing succeed: 1295 * - if replacing succeeds:
1296 * + copy data over; 1296 * + copy data over;
1297 * + free old pages; 1297 * + free old pages;
1298 * + unfreeze huge page; 1298 * + unfreeze huge page;
1299 * - if replacing failed; 1299 * - if replacing failed;
1300 * + put all pages back and unfreeze them; 1300 * + put all pages back and unfreeze them;
1301 * + restore gaps in the radix-tree; 1301 * + restore gaps in the page cache;
1302 * + free huge page; 1302 * + free huge page;
1303 */ 1303 */
1304static void collapse_shmem(struct mm_struct *mm, 1304static void collapse_shmem(struct mm_struct *mm,
@@ -1306,12 +1306,11 @@ static void collapse_shmem(struct mm_struct *mm,
1306 struct page **hpage, int node) 1306 struct page **hpage, int node)
1307{ 1307{
1308 gfp_t gfp; 1308 gfp_t gfp;
1309 struct page *page, *new_page, *tmp; 1309 struct page *new_page;
1310 struct mem_cgroup *memcg; 1310 struct mem_cgroup *memcg;
1311 pgoff_t index, end = start + HPAGE_PMD_NR; 1311 pgoff_t index, end = start + HPAGE_PMD_NR;
1312 LIST_HEAD(pagelist); 1312 LIST_HEAD(pagelist);
1313 struct radix_tree_iter iter; 1313 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1314 void **slot;
1315 int nr_none = 0, result = SCAN_SUCCEED; 1314 int nr_none = 0, result = SCAN_SUCCEED;
1316 1315
1317 VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 1316 VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1336,48 +1335,49 @@ static void collapse_shmem(struct mm_struct *mm,
1336 __SetPageLocked(new_page); 1335 __SetPageLocked(new_page);
1337 BUG_ON(!page_ref_freeze(new_page, 1)); 1336 BUG_ON(!page_ref_freeze(new_page, 1));
1338 1337
1339
1340 /* 1338 /*
1341 * At this point the new_page is 'frozen' (page_count() is zero), locked 1339 * At this point the new_page is 'frozen' (page_count() is zero),
1342 * and not up-to-date. It's safe to insert it into radix tree, because 1340 * locked and not up-to-date. It's safe to insert it into the page
1343 * nobody would be able to map it or use it in other way until we 1341 * cache, because nobody would be able to map it or use it in other
1344 * unfreeze it. 1342 * way until we unfreeze it.
1345 */ 1343 */
1346 1344
1347 index = start; 1345 /* This will be less messy when we use multi-index entries */
1348 xa_lock_irq(&mapping->i_pages); 1346 do {
1349 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 1347 xas_lock_irq(&xas);
1350 int n = min(iter.index, end) - index; 1348 xas_create_range(&xas);
1351 1349 if (!xas_error(&xas))
1352 /*
1353 * Handle holes in the radix tree: charge it from shmem and
1354 * insert relevant subpage of new_page into the radix-tree.
1355 */
1356 if (n && !shmem_charge(mapping->host, n)) {
1357 result = SCAN_FAIL;
1358 break; 1350 break;
1359 } 1351 xas_unlock_irq(&xas);
1360 nr_none += n; 1352 if (!xas_nomem(&xas, GFP_KERNEL))
1361 for (; index < min(iter.index, end); index++) { 1353 goto out;
1362 radix_tree_insert(&mapping->i_pages, index, 1354 } while (1);
1363 new_page + (index % HPAGE_PMD_NR));
1364 }
1365 1355
1366 /* We are done. */ 1356 xas_set(&xas, start);
1367 if (index >= end) 1357 for (index = start; index < end; index++) {
1368 break; 1358 struct page *page = xas_next(&xas);
1359
1360 VM_BUG_ON(index != xas.xa_index);
1361 if (!page) {
1362 if (!shmem_charge(mapping->host, 1)) {
1363 result = SCAN_FAIL;
1364 break;
1365 }
1366 xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
1367 nr_none++;
1368 continue;
1369 }
1369 1370
1370 page = radix_tree_deref_slot_protected(slot, 1371 if (xa_is_value(page) || !PageUptodate(page)) {
1371 &mapping->i_pages.xa_lock); 1372 xas_unlock_irq(&xas);
1372 if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
1373 xa_unlock_irq(&mapping->i_pages);
1374 /* swap in or instantiate fallocated page */ 1373 /* swap in or instantiate fallocated page */
1375 if (shmem_getpage(mapping->host, index, &page, 1374 if (shmem_getpage(mapping->host, index, &page,
1376 SGP_NOHUGE)) { 1375 SGP_NOHUGE)) {
1377 result = SCAN_FAIL; 1376 result = SCAN_FAIL;
1378 goto tree_unlocked; 1377 goto xa_unlocked;
1379 } 1378 }
1380 xa_lock_irq(&mapping->i_pages); 1379 xas_lock_irq(&xas);
1380 xas_set(&xas, index);
1381 } else if (trylock_page(page)) { 1381 } else if (trylock_page(page)) {
1382 get_page(page); 1382 get_page(page);
1383 } else { 1383 } else {
@@ -1397,7 +1397,7 @@ static void collapse_shmem(struct mm_struct *mm,
1397 result = SCAN_TRUNCATED; 1397 result = SCAN_TRUNCATED;
1398 goto out_unlock; 1398 goto out_unlock;
1399 } 1399 }
1400 xa_unlock_irq(&mapping->i_pages); 1400 xas_unlock_irq(&xas);
1401 1401
1402 if (isolate_lru_page(page)) { 1402 if (isolate_lru_page(page)) {
1403 result = SCAN_DEL_PAGE_LRU; 1403 result = SCAN_DEL_PAGE_LRU;
@@ -1407,17 +1407,16 @@ static void collapse_shmem(struct mm_struct *mm,
1407 if (page_mapped(page)) 1407 if (page_mapped(page))
1408 unmap_mapping_pages(mapping, index, 1, false); 1408 unmap_mapping_pages(mapping, index, 1, false);
1409 1409
1410 xa_lock_irq(&mapping->i_pages); 1410 xas_lock_irq(&xas);
1411 xas_set(&xas, index);
1411 1412
1412 slot = radix_tree_lookup_slot(&mapping->i_pages, index); 1413 VM_BUG_ON_PAGE(page != xas_load(&xas), page);
1413 VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
1414 &mapping->i_pages.xa_lock), page);
1415 VM_BUG_ON_PAGE(page_mapped(page), page); 1414 VM_BUG_ON_PAGE(page_mapped(page), page);
1416 1415
1417 /* 1416 /*
1418 * The page is expected to have page_count() == 3: 1417 * The page is expected to have page_count() == 3:
1419 * - we hold a pin on it; 1418 * - we hold a pin on it;
1420 * - one reference from radix tree; 1419 * - one reference from page cache;
1421 * - one from isolate_lru_page; 1420 * - one from isolate_lru_page;
1422 */ 1421 */
1423 if (!page_ref_freeze(page, 3)) { 1422 if (!page_ref_freeze(page, 3)) {
@@ -1432,56 +1431,30 @@ static void collapse_shmem(struct mm_struct *mm,
1432 list_add_tail(&page->lru, &pagelist); 1431 list_add_tail(&page->lru, &pagelist);
1433 1432
1434 /* Finally, replace with the new page. */ 1433 /* Finally, replace with the new page. */
1435 radix_tree_replace_slot(&mapping->i_pages, slot, 1434 xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
1436 new_page + (index % HPAGE_PMD_NR));
1437
1438 slot = radix_tree_iter_resume(slot, &iter);
1439 index++;
1440 continue; 1435 continue;
1441out_lru: 1436out_lru:
1442 xa_unlock_irq(&mapping->i_pages); 1437 xas_unlock_irq(&xas);
1443 putback_lru_page(page); 1438 putback_lru_page(page);
1444out_isolate_failed: 1439out_isolate_failed:
1445 unlock_page(page); 1440 unlock_page(page);
1446 put_page(page); 1441 put_page(page);
1447 goto tree_unlocked; 1442 goto xa_unlocked;
1448out_unlock: 1443out_unlock:
1449 unlock_page(page); 1444 unlock_page(page);
1450 put_page(page); 1445 put_page(page);
1451 break; 1446 break;
1452 } 1447 }
1448 xas_unlock_irq(&xas);
1453 1449
1454 /* 1450xa_unlocked:
1455 * Handle hole in radix tree at the end of the range.
1456 * This code only triggers if there's nothing in radix tree
1457 * beyond 'end'.
1458 */
1459 if (result == SCAN_SUCCEED && index < end) {
1460 int n = end - index;
1461
1462 if (!shmem_charge(mapping->host, n)) {
1463 result = SCAN_FAIL;
1464 goto tree_locked;
1465 }
1466
1467 for (; index < end; index++) {
1468 radix_tree_insert(&mapping->i_pages, index,
1469 new_page + (index % HPAGE_PMD_NR));
1470 }
1471 nr_none += n;
1472 }
1473
1474tree_locked:
1475 xa_unlock_irq(&mapping->i_pages);
1476tree_unlocked:
1477
1478 if (result == SCAN_SUCCEED) { 1451 if (result == SCAN_SUCCEED) {
1479 unsigned long flags; 1452 struct page *page, *tmp;
1480 struct zone *zone = page_zone(new_page); 1453 struct zone *zone = page_zone(new_page);
1481 1454
1482 /* 1455 /*
1483 * Replacing old pages with new one has succeed, now we need to 1456 * Replacing old pages with new one has succeeded, now we
1484 * copy the content and free old pages. 1457 * need to copy the content and free the old pages.
1485 */ 1458 */
1486 list_for_each_entry_safe(page, tmp, &pagelist, lru) { 1459 list_for_each_entry_safe(page, tmp, &pagelist, lru) {
1487 copy_highpage(new_page + (page->index % HPAGE_PMD_NR), 1460 copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
@@ -1495,16 +1468,16 @@ tree_unlocked:
1495 put_page(page); 1468 put_page(page);
1496 } 1469 }
1497 1470
1498 local_irq_save(flags); 1471 local_irq_disable();
1499 __inc_node_page_state(new_page, NR_SHMEM_THPS); 1472 __inc_node_page_state(new_page, NR_SHMEM_THPS);
1500 if (nr_none) { 1473 if (nr_none) {
1501 __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); 1474 __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
1502 __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); 1475 __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
1503 } 1476 }
1504 local_irq_restore(flags); 1477 local_irq_enable();
1505 1478
1506 /* 1479 /*
1507 * Remove pte page tables, so we can re-faulti 1480 * Remove pte page tables, so we can re-fault
1508 * the page as huge. 1481 * the page as huge.
1509 */ 1482 */
1510 retract_page_tables(mapping, start); 1483 retract_page_tables(mapping, start);
@@ -1521,37 +1494,37 @@ tree_unlocked:
1521 1494
1522 khugepaged_pages_collapsed++; 1495 khugepaged_pages_collapsed++;
1523 } else { 1496 } else {
1524 /* Something went wrong: rollback changes to the radix-tree */ 1497 struct page *page;
1498 /* Something went wrong: roll back page cache changes */
1525 shmem_uncharge(mapping->host, nr_none); 1499 shmem_uncharge(mapping->host, nr_none);
1526 xa_lock_irq(&mapping->i_pages); 1500 xas_lock_irq(&xas);
1527 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 1501 xas_set(&xas, start);
1528 if (iter.index >= end) 1502 xas_for_each(&xas, page, end - 1) {
1529 break;
1530 page = list_first_entry_or_null(&pagelist, 1503 page = list_first_entry_or_null(&pagelist,
1531 struct page, lru); 1504 struct page, lru);
1532 if (!page || iter.index < page->index) { 1505 if (!page || xas.xa_index < page->index) {
1533 if (!nr_none) 1506 if (!nr_none)
1534 break; 1507 break;
1535 nr_none--; 1508 nr_none--;
1536 /* Put holes back where they were */ 1509 /* Put holes back where they were */
1537 radix_tree_delete(&mapping->i_pages, iter.index); 1510 xas_store(&xas, NULL);
1538 continue; 1511 continue;
1539 } 1512 }
1540 1513
1541 VM_BUG_ON_PAGE(page->index != iter.index, page); 1514 VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
1542 1515
1543 /* Unfreeze the page. */ 1516 /* Unfreeze the page. */
1544 list_del(&page->lru); 1517 list_del(&page->lru);
1545 page_ref_unfreeze(page, 2); 1518 page_ref_unfreeze(page, 2);
1546 radix_tree_replace_slot(&mapping->i_pages, slot, page); 1519 xas_store(&xas, page);
1547 slot = radix_tree_iter_resume(slot, &iter); 1520 xas_pause(&xas);
1548 xa_unlock_irq(&mapping->i_pages); 1521 xas_unlock_irq(&xas);
1549 putback_lru_page(page); 1522 putback_lru_page(page);
1550 unlock_page(page); 1523 unlock_page(page);
1551 xa_lock_irq(&mapping->i_pages); 1524 xas_lock_irq(&xas);
1552 } 1525 }
1553 VM_BUG_ON(nr_none); 1526 VM_BUG_ON(nr_none);
1554 xa_unlock_irq(&mapping->i_pages); 1527 xas_unlock_irq(&xas);
1555 1528
1556 /* Unfreeze new_page, caller would take care about freeing it */ 1529 /* Unfreeze new_page, caller would take care about freeing it */
1557 page_ref_unfreeze(new_page, 1); 1530 page_ref_unfreeze(new_page, 1);
@@ -1569,8 +1542,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
1569 pgoff_t start, struct page **hpage) 1542 pgoff_t start, struct page **hpage)
1570{ 1543{
1571 struct page *page = NULL; 1544 struct page *page = NULL;
1572 struct radix_tree_iter iter; 1545 XA_STATE(xas, &mapping->i_pages, start);
1573 void **slot;
1574 int present, swap; 1546 int present, swap;
1575 int node = NUMA_NO_NODE; 1547 int node = NUMA_NO_NODE;
1576 int result = SCAN_SUCCEED; 1548 int result = SCAN_SUCCEED;
@@ -1579,17 +1551,11 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
1579 swap = 0; 1551 swap = 0;
1580 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); 1552 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1581 rcu_read_lock(); 1553 rcu_read_lock();
1582 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 1554 xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
1583 if (iter.index >= start + HPAGE_PMD_NR) 1555 if (xas_retry(&xas, page))
1584 break;
1585
1586 page = radix_tree_deref_slot(slot);
1587 if (radix_tree_deref_retry(page)) {
1588 slot = radix_tree_iter_retry(&iter);
1589 continue; 1556 continue;
1590 }
1591 1557
1592 if (radix_tree_exception(page)) { 1558 if (xa_is_value(page)) {
1593 if (++swap > khugepaged_max_ptes_swap) { 1559 if (++swap > khugepaged_max_ptes_swap) {
1594 result = SCAN_EXCEED_SWAP_PTE; 1560 result = SCAN_EXCEED_SWAP_PTE;
1595 break; 1561 break;
@@ -1628,7 +1594,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
1628 present++; 1594 present++;
1629 1595
1630 if (need_resched()) { 1596 if (need_resched()) {
1631 slot = radix_tree_iter_resume(slot, &iter); 1597 xas_pause(&xas);
1632 cond_resched_rcu(); 1598 cond_resched_rcu();
1633 } 1599 }
1634 } 1600 }
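collapse_shmem() now reserves the whole target range up front with xas_create_range(), using the same xas_nomem() loop so node allocation happens without the lock; later stores into the range then cannot fail. A hedged sketch of that reservation step on its own (reserve_range() is illustrative):

#include <linux/xarray.h>

/*
 * Make every slot covering the order-@order range at @start exist, so
 * that later stores into the range cannot fail with -ENOMEM.
 */
static int reserve_range(struct xarray *xa, unsigned long start,
                         unsigned int order, gfp_t gfp)
{
        XA_STATE_ORDER(xas, xa, start, order);

        do {
                xas_lock_irq(&xas);
                xas_create_range(&xas);         /* allocate nodes for the range */
                if (!xas_error(&xas))
                        break;                  /* success: lock still held */
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));         /* get memory unlocked, retry */

        if (xas_error(&xas))
                return xas_error(&xas);         /* lock already dropped */

        xas_unlock_irq(&xas);
        return 0;
}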
diff --git a/mm/madvise.c b/mm/madvise.c
index 71d21df2a3f3..6cb1ca93e290 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -251,7 +251,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
251 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 251 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
252 252
253 page = find_get_entry(mapping, index); 253 page = find_get_entry(mapping, index);
254 if (!radix_tree_exceptional_entry(page)) { 254 if (!xa_is_value(page)) {
255 if (page) 255 if (page)
256 put_page(page); 256 put_page(page);
257 continue; 257 continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 10a9b554d69f..54920cbc46bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4728,7 +4728,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4728 /* shmem/tmpfs may report page out on swap: account for that too. */ 4728 /* shmem/tmpfs may report page out on swap: account for that too. */
4729 if (shmem_mapping(mapping)) { 4729 if (shmem_mapping(mapping)) {
4730 page = find_get_entry(mapping, pgoff); 4730 page = find_get_entry(mapping, pgoff);
4731 if (radix_tree_exceptional_entry(page)) { 4731 if (xa_is_value(page)) {
4732 swp_entry_t swp = radix_to_swp_entry(page); 4732 swp_entry_t swp = radix_to_swp_entry(page);
4733 if (do_memsw_account()) 4733 if (do_memsw_account())
4734 *entry = swp; 4734 *entry = swp;
diff --git a/mm/memfd.c b/mm/memfd.c
index 2bb5e257080e..97264c79d2cd 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -21,44 +21,36 @@
21#include <uapi/linux/memfd.h> 21#include <uapi/linux/memfd.h>
22 22
23/* 23/*
24 * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, 24 * We need a tag: a new tag would expand every xa_node by 8 bytes,
25 * so reuse a tag which we firmly believe is never set or cleared on tmpfs 25 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
26 * or hugetlbfs because they are memory only filesystems. 26 * or hugetlbfs because they are memory only filesystems.
27 */ 27 */
28#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE 28#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
29#define LAST_SCAN 4 /* about 150ms max */ 29#define LAST_SCAN 4 /* about 150ms max */
30 30
31static void memfd_tag_pins(struct address_space *mapping) 31static void memfd_tag_pins(struct xa_state *xas)
32{ 32{
33 struct radix_tree_iter iter;
34 void __rcu **slot;
35 pgoff_t start;
36 struct page *page; 33 struct page *page;
34 unsigned int tagged = 0;
37 35
38 lru_add_drain(); 36 lru_add_drain();
39 start = 0;
40 rcu_read_lock();
41
42 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
43 page = radix_tree_deref_slot(slot);
44 if (!page || radix_tree_exception(page)) {
45 if (radix_tree_deref_retry(page)) {
46 slot = radix_tree_iter_retry(&iter);
47 continue;
48 }
49 } else if (page_count(page) - page_mapcount(page) > 1) {
50 xa_lock_irq(&mapping->i_pages);
51 radix_tree_tag_set(&mapping->i_pages, iter.index,
52 MEMFD_TAG_PINNED);
53 xa_unlock_irq(&mapping->i_pages);
54 }
55 37
56 if (need_resched()) { 38 xas_lock_irq(xas);
57 slot = radix_tree_iter_resume(slot, &iter); 39 xas_for_each(xas, page, ULONG_MAX) {
58 cond_resched_rcu(); 40 if (xa_is_value(page))
59 } 41 continue;
42 if (page_count(page) - page_mapcount(page) > 1)
43 xas_set_mark(xas, MEMFD_TAG_PINNED);
44
45 if (++tagged % XA_CHECK_SCHED)
46 continue;
47
48 xas_pause(xas);
49 xas_unlock_irq(xas);
50 cond_resched();
51 xas_lock_irq(xas);
60 } 52 }
61 rcu_read_unlock(); 53 xas_unlock_irq(xas);
62} 54}
63 55
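memfd_tag_pins() shows the shape of a long modification pass: walk with xas_for_each() under xas_lock_irq(), and every XA_CHECK_SCHED entries call xas_pause() so the lock can be dropped and the task rescheduled before the walk resumes where it left off. A sketch of that batching pattern with an arbitrary predicate in place of the pin test (mark_matching() and pred() are illustrative):

#include <linux/sched.h>
#include <linux/xarray.h>

/*
 * Mark every present entry for which @pred returns true.  Every
 * XA_CHECK_SCHED entries, pause the walk and drop the lock so other
 * users of the array (and the scheduler) get a chance to run.
 */
static void mark_matching(struct xarray *xa, xa_mark_t mark,
                          bool (*pred)(void *entry))
{
        XA_STATE(xas, xa, 0);
        unsigned int processed = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each(&xas, entry, ULONG_MAX) {
                if (xa_is_value(entry))
                        continue;
                if (pred(entry))
                        xas_set_mark(&xas, mark);

                if (++processed % XA_CHECK_SCHED)
                        continue;

                xas_pause(&xas);        /* remember the position across unlock */
                xas_unlock_irq(&xas);
                cond_resched();
                xas_lock_irq(&xas);
        }
        xas_unlock_irq(&xas);
}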
64/* 56/*
@@ -72,17 +64,17 @@ static void memfd_tag_pins(struct address_space *mapping)
72 */ 64 */
73static int memfd_wait_for_pins(struct address_space *mapping) 65static int memfd_wait_for_pins(struct address_space *mapping)
74{ 66{
75 struct radix_tree_iter iter; 67 XA_STATE(xas, &mapping->i_pages, 0);
76 void __rcu **slot;
77 pgoff_t start;
78 struct page *page; 68 struct page *page;
79 int error, scan; 69 int error, scan;
80 70
81 memfd_tag_pins(mapping); 71 memfd_tag_pins(&xas);
82 72
83 error = 0; 73 error = 0;
84 for (scan = 0; scan <= LAST_SCAN; scan++) { 74 for (scan = 0; scan <= LAST_SCAN; scan++) {
85 if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) 75 unsigned int tagged = 0;
76
77 if (!xas_marked(&xas, MEMFD_TAG_PINNED))
86 break; 78 break;
87 79
88 if (!scan) 80 if (!scan)
@@ -90,45 +82,34 @@ static int memfd_wait_for_pins(struct address_space *mapping)
90 else if (schedule_timeout_killable((HZ << scan) / 200)) 82 else if (schedule_timeout_killable((HZ << scan) / 200))
91 scan = LAST_SCAN; 83 scan = LAST_SCAN;
92 84
93 start = 0; 85 xas_set(&xas, 0);
94 rcu_read_lock(); 86 xas_lock_irq(&xas);
95 radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 87 xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
96 start, MEMFD_TAG_PINNED) { 88 bool clear = true;
97 89 if (xa_is_value(page))
98 page = radix_tree_deref_slot(slot); 90 continue;
99 if (radix_tree_exception(page)) { 91 if (page_count(page) - page_mapcount(page) != 1) {
100 if (radix_tree_deref_retry(page)) {
101 slot = radix_tree_iter_retry(&iter);
102 continue;
103 }
104
105 page = NULL;
106 }
107
108 if (page &&
109 page_count(page) - page_mapcount(page) != 1) {
110 if (scan < LAST_SCAN)
111 goto continue_resched;
112
113 /* 92 /*
114 * On the last scan, we clean up all those tags 93 * On the last scan, we clean up all those tags
115 * we inserted; but make a note that we still 94 * we inserted; but make a note that we still
116 * found pages pinned. 95 * found pages pinned.
117 */ 96 */
118 error = -EBUSY; 97 if (scan == LAST_SCAN)
98 error = -EBUSY;
99 else
100 clear = false;
119 } 101 }
102 if (clear)
103 xas_clear_mark(&xas, MEMFD_TAG_PINNED);
104 if (++tagged % XA_CHECK_SCHED)
105 continue;
120 106
121 xa_lock_irq(&mapping->i_pages); 107 xas_pause(&xas);
122 radix_tree_tag_clear(&mapping->i_pages, 108 xas_unlock_irq(&xas);
123 iter.index, MEMFD_TAG_PINNED); 109 cond_resched();
124 xa_unlock_irq(&mapping->i_pages); 110 xas_lock_irq(&xas);
125continue_resched:
126 if (need_resched()) {
127 slot = radix_tree_iter_resume(slot, &iter);
128 cond_resched_rcu();
129 }
130 } 111 }
131 rcu_read_unlock(); 112 xas_unlock_irq(&xas);
132 } 113 }
133 114
134 return error; 115 return error;
diff --git a/mm/migrate.c b/mm/migrate.c
index b6700f2962f3..f7e4bfdc13b7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -326,7 +326,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
326 page = migration_entry_to_page(entry); 326 page = migration_entry_to_page(entry);
327 327
328 /* 328 /*
329 * Once radix-tree replacement of page migration started, page_count 329 * Once page cache replacement of page migration started, page_count
330 * *must* be zero. And, we don't want to call wait_on_page_locked() 330 * *must* be zero. And, we don't want to call wait_on_page_locked()
331 * against a page without get_page(). 331 * against a page without get_page().
332 * So, we use get_page_unless_zero(), here. Even failed, page fault 332 * So, we use get_page_unless_zero(), here. Even failed, page fault
@@ -441,10 +441,10 @@ int migrate_page_move_mapping(struct address_space *mapping,
441 struct buffer_head *head, enum migrate_mode mode, 441 struct buffer_head *head, enum migrate_mode mode,
442 int extra_count) 442 int extra_count)
443{ 443{
444 XA_STATE(xas, &mapping->i_pages, page_index(page));
444 struct zone *oldzone, *newzone; 445 struct zone *oldzone, *newzone;
445 int dirty; 446 int dirty;
446 int expected_count = 1 + extra_count; 447 int expected_count = 1 + extra_count;
447 void **pslot;
448 448
449 /* 449 /*
450 * Device public or private pages have an extra refcount as they are 450 * Device public or private pages have an extra refcount as they are
@@ -470,21 +470,16 @@ int migrate_page_move_mapping(struct address_space *mapping,
470 oldzone = page_zone(page); 470 oldzone = page_zone(page);
471 newzone = page_zone(newpage); 471 newzone = page_zone(newpage);
472 472
473 xa_lock_irq(&mapping->i_pages); 473 xas_lock_irq(&xas);
474
475 pslot = radix_tree_lookup_slot(&mapping->i_pages,
476 page_index(page));
477 474
478 expected_count += hpage_nr_pages(page) + page_has_private(page); 475 expected_count += hpage_nr_pages(page) + page_has_private(page);
479 if (page_count(page) != expected_count || 476 if (page_count(page) != expected_count || xas_load(&xas) != page) {
480 radix_tree_deref_slot_protected(pslot, 477 xas_unlock_irq(&xas);
481 &mapping->i_pages.xa_lock) != page) {
482 xa_unlock_irq(&mapping->i_pages);
483 return -EAGAIN; 478 return -EAGAIN;
484 } 479 }
485 480
486 if (!page_ref_freeze(page, expected_count)) { 481 if (!page_ref_freeze(page, expected_count)) {
487 xa_unlock_irq(&mapping->i_pages); 482 xas_unlock_irq(&xas);
488 return -EAGAIN; 483 return -EAGAIN;
489 } 484 }
490 485
@@ -498,7 +493,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
498 if (mode == MIGRATE_ASYNC && head && 493 if (mode == MIGRATE_ASYNC && head &&
499 !buffer_migrate_lock_buffers(head, mode)) { 494 !buffer_migrate_lock_buffers(head, mode)) {
500 page_ref_unfreeze(page, expected_count); 495 page_ref_unfreeze(page, expected_count);
501 xa_unlock_irq(&mapping->i_pages); 496 xas_unlock_irq(&xas);
502 return -EAGAIN; 497 return -EAGAIN;
503 } 498 }
504 499
@@ -526,16 +521,13 @@ int migrate_page_move_mapping(struct address_space *mapping,
526 SetPageDirty(newpage); 521 SetPageDirty(newpage);
527 } 522 }
528 523
529 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 524 xas_store(&xas, newpage);
530 if (PageTransHuge(page)) { 525 if (PageTransHuge(page)) {
531 int i; 526 int i;
532 int index = page_index(page);
533 527
534 for (i = 1; i < HPAGE_PMD_NR; i++) { 528 for (i = 1; i < HPAGE_PMD_NR; i++) {
535 pslot = radix_tree_lookup_slot(&mapping->i_pages, 529 xas_next(&xas);
536 index + i); 530 xas_store(&xas, newpage + i);
537 radix_tree_replace_slot(&mapping->i_pages, pslot,
538 newpage + i);
539 } 531 }
540 } 532 }
541 533
@@ -546,7 +538,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
546 */ 538 */
547 page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); 539 page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
548 540
549 xa_unlock(&mapping->i_pages); 541 xas_unlock(&xas);
550 /* Leave irq disabled to prevent preemption while updating stats */ 542 /* Leave irq disabled to prevent preemption while updating stats */
551 543
552 /* 544 /*
@@ -586,22 +578,18 @@ EXPORT_SYMBOL(migrate_page_move_mapping);
586int migrate_huge_page_move_mapping(struct address_space *mapping, 578int migrate_huge_page_move_mapping(struct address_space *mapping,
587 struct page *newpage, struct page *page) 579 struct page *newpage, struct page *page)
588{ 580{
581 XA_STATE(xas, &mapping->i_pages, page_index(page));
589 int expected_count; 582 int expected_count;
590 void **pslot;
591
592 xa_lock_irq(&mapping->i_pages);
593
594 pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
595 583
584 xas_lock_irq(&xas);
596 expected_count = 2 + page_has_private(page); 585 expected_count = 2 + page_has_private(page);
597 if (page_count(page) != expected_count || 586 if (page_count(page) != expected_count || xas_load(&xas) != page) {
598 radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { 587 xas_unlock_irq(&xas);
599 xa_unlock_irq(&mapping->i_pages);
600 return -EAGAIN; 588 return -EAGAIN;
601 } 589 }
602 590
603 if (!page_ref_freeze(page, expected_count)) { 591 if (!page_ref_freeze(page, expected_count)) {
604 xa_unlock_irq(&mapping->i_pages); 592 xas_unlock_irq(&xas);
605 return -EAGAIN; 593 return -EAGAIN;
606 } 594 }
607 595
@@ -610,11 +598,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
610 598
611 get_page(newpage); 599 get_page(newpage);
612 600
613 radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); 601 xas_store(&xas, newpage);
614 602
615 page_ref_unfreeze(page, expected_count - 1); 603 page_ref_unfreeze(page, expected_count - 1);
616 604
617 xa_unlock_irq(&mapping->i_pages); 605 xas_unlock_irq(&xas);
618 606
619 return MIGRATEPAGE_SUCCESS; 607 return MIGRATEPAGE_SUCCESS;
620} 608}
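
The two conversions above share one pattern: open an XA_STATE at the page's index, verify the old entry while holding the xas lock, then overwrite the slot (and any tail-page slots) in place. A minimal sketch of that shape, not part of the patch (cache_replace_page() is a hypothetical name; the caller is assumed to have frozen the page's refcount as migrate does):

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

static int cache_replace_page(struct address_space *mapping,
			      struct page *old, struct page *new,
			      unsigned long nr)
{
	XA_STATE(xas, &mapping->i_pages, page_index(old));
	unsigned long i;

	xas_lock_irq(&xas);
	if (xas_load(&xas) != old) {
		/* The slot changed under us; let the caller retry. */
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

	xas_store(&xas, new);			/* head page */
	for (i = 1; i < nr; i++) {
		xas_next(&xas);			/* advance to index + i */
		xas_store(&xas, new + i);	/* tail pages */
	}
	xas_unlock_irq(&xas);
	return 0;
}

Because the slots already exist, none of these stores needs to allocate memory, which is why no GFP flags appear and why the old radix_tree_lookup_slot()/radix_tree_replace_slot() pair could collapse into xas_load()/xas_store().
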
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe226e6..4985965aa20a 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
66 * shmem/tmpfs may return swap: account for swapcache 66 * shmem/tmpfs may return swap: account for swapcache
67 * page too. 67 * page too.
68 */ 68 */
69 if (radix_tree_exceptional_entry(page)) { 69 if (xa_is_value(page)) {
70 swp_entry_t swp = radix_to_swp_entry(page); 70 swp_entry_t swp = radix_to_swp_entry(page);
71 page = find_get_page(swap_address_space(swp), 71 page = find_get_page(swap_address_space(swp),
72 swp_offset(swp)); 72 swp_offset(swp));
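
The mincore change is the simplest form of the conversion: xa_is_value() replaces radix_tree_exceptional_entry() as the test for entries that encode data (a swap entry here) instead of a page pointer. A tiny illustrative helper, not in the patch (cache_entry_is_value() is a hypothetical name):

#include <linux/pagemap.h>
#include <linux/xarray.h>

/* Is @index cached as a value (swap) entry rather than a real page? */
static bool cache_entry_is_value(struct address_space *mapping, pgoff_t index)
{
	void *entry = xa_load(&mapping->i_pages, index);

	return entry && xa_is_value(entry);
}
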
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439a304a6c92..3f690bae6b78 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2097,34 +2097,25 @@ void __init page_writeback_init(void)
2097 * dirty pages in the file (thus it is important for this function to be quick 2097 * dirty pages in the file (thus it is important for this function to be quick
2098 * so that it can tag pages faster than a dirtying process can create them). 2098 * so that it can tag pages faster than a dirtying process can create them).
2099 */ 2099 */
2100/*
2101 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
2102 * latency.
2103 */
2104void tag_pages_for_writeback(struct address_space *mapping, 2100void tag_pages_for_writeback(struct address_space *mapping,
2105 pgoff_t start, pgoff_t end) 2101 pgoff_t start, pgoff_t end)
2106{ 2102{
2107#define WRITEBACK_TAG_BATCH 4096 2103 XA_STATE(xas, &mapping->i_pages, start);
2108 unsigned long tagged = 0; 2104 unsigned int tagged = 0;
2109 struct radix_tree_iter iter; 2105 void *page;
2110 void **slot;
2111 2106
2112 xa_lock_irq(&mapping->i_pages); 2107 xas_lock_irq(&xas);
2113 radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, 2108 xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
2114 PAGECACHE_TAG_DIRTY) { 2109 xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
2115 if (iter.index > end) 2110 if (++tagged % XA_CHECK_SCHED)
2116 break;
2117 radix_tree_iter_tag_set(&mapping->i_pages, &iter,
2118 PAGECACHE_TAG_TOWRITE);
2119 tagged++;
2120 if ((tagged % WRITEBACK_TAG_BATCH) != 0)
2121 continue; 2111 continue;
2122 slot = radix_tree_iter_resume(slot, &iter); 2112
2123 xa_unlock_irq(&mapping->i_pages); 2113 xas_pause(&xas);
2114 xas_unlock_irq(&xas);
2124 cond_resched(); 2115 cond_resched();
2125 xa_lock_irq(&mapping->i_pages); 2116 xas_lock_irq(&xas);
2126 } 2117 }
2127 xa_unlock_irq(&mapping->i_pages); 2118 xas_unlock_irq(&xas);
2128} 2119}
2129EXPORT_SYMBOL(tag_pages_for_writeback); 2120EXPORT_SYMBOL(tag_pages_for_writeback);
2130 2121
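
tag_pages_for_writeback() above shows the batched-iteration idiom: walk the entries carrying one mark, set another mark on each, and every XA_CHECK_SCHED entries call xas_pause() so the lock can be dropped and the walk later resumed at the same place. A stripped-down sketch of the same loop (copy_mark_range() is a hypothetical name, not part of the patch):

#include <linux/sched.h>
#include <linux/xarray.h>

static void copy_mark_range(struct xarray *xa, unsigned long start,
			    unsigned long end, xa_mark_t from, xa_mark_t to)
{
	XA_STATE(xas, xa, start);
	unsigned int done = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end, from) {
		xas_set_mark(&xas, to);
		if (++done % XA_CHECK_SCHED)
			continue;
		xas_pause(&xas);		/* make the state safe to drop */
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
}

This replaces the old WRITEBACK_TAG_BATCH/radix_tree_iter_resume() dance; XA_CHECK_SCHED is the batch size the patch relies on.
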
@@ -2170,7 +2161,7 @@ int write_cache_pages(struct address_space *mapping,
2170 pgoff_t end; /* Inclusive */ 2161 pgoff_t end; /* Inclusive */
2171 pgoff_t done_index; 2162 pgoff_t done_index;
2172 int range_whole = 0; 2163 int range_whole = 0;
2173 int tag; 2164 xa_mark_t tag;
2174 2165
2175 pagevec_init(&pvec); 2166 pagevec_init(&pvec);
2176 if (wbc->range_cyclic) { 2167 if (wbc->range_cyclic) {
@@ -2442,7 +2433,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
2442 2433
2443/* 2434/*
2444 * For address_spaces which do not use buffers. Just tag the page as dirty in 2435 * For address_spaces which do not use buffers. Just tag the page as dirty in
2445 * its radix tree. 2436 * the xarray.
2446 * 2437 *
2447 * This is also used when a single buffer is being dirtied: we want to set the 2438 * This is also used when a single buffer is being dirtied: we want to set the
2448 * page dirty in that case, but not all the buffers. This is a "bottom-up" 2439 * page dirty in that case, but not all the buffers. This is a "bottom-up"
@@ -2468,7 +2459,7 @@ int __set_page_dirty_nobuffers(struct page *page)
2468 BUG_ON(page_mapping(page) != mapping); 2459 BUG_ON(page_mapping(page) != mapping);
2469 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2460 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2470 account_page_dirtied(page, mapping); 2461 account_page_dirtied(page, mapping);
2471 radix_tree_tag_set(&mapping->i_pages, page_index(page), 2462 __xa_set_mark(&mapping->i_pages, page_index(page),
2472 PAGECACHE_TAG_DIRTY); 2463 PAGECACHE_TAG_DIRTY);
2473 xa_unlock_irqrestore(&mapping->i_pages, flags); 2464 xa_unlock_irqrestore(&mapping->i_pages, flags);
2474 unlock_page_memcg(page); 2465 unlock_page_memcg(page);
@@ -2631,13 +2622,13 @@ EXPORT_SYMBOL(__cancel_dirty_page);
2631 * Returns true if the page was previously dirty. 2622 * Returns true if the page was previously dirty.
2632 * 2623 *
2633 * This is for preparing to put the page under writeout. We leave the page 2624 * This is for preparing to put the page under writeout. We leave the page
2634 * tagged as dirty in the radix tree so that a concurrent write-for-sync 2625 * tagged as dirty in the xarray so that a concurrent write-for-sync
2635 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage 2626 * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
2636 * implementation will run either set_page_writeback() or set_page_dirty(), 2627 * implementation will run either set_page_writeback() or set_page_dirty(),
2637 * at which stage we bring the page's dirty flag and radix-tree dirty tag 2628 * at which stage we bring the page's dirty flag and xarray dirty tag
2638 * back into sync. 2629 * back into sync.
2639 * 2630 *
2640 * This incoherency between the page's dirty flag and radix-tree tag is 2631 * This incoherency between the page's dirty flag and xarray tag is
2641 * unfortunate, but it only exists while the page is locked. 2632 * unfortunate, but it only exists while the page is locked.
2642 */ 2633 */
2643int clear_page_dirty_for_io(struct page *page) 2634int clear_page_dirty_for_io(struct page *page)
@@ -2718,7 +2709,7 @@ int test_clear_page_writeback(struct page *page)
2718 xa_lock_irqsave(&mapping->i_pages, flags); 2709 xa_lock_irqsave(&mapping->i_pages, flags);
2719 ret = TestClearPageWriteback(page); 2710 ret = TestClearPageWriteback(page);
2720 if (ret) { 2711 if (ret) {
2721 radix_tree_tag_clear(&mapping->i_pages, page_index(page), 2712 __xa_clear_mark(&mapping->i_pages, page_index(page),
2722 PAGECACHE_TAG_WRITEBACK); 2713 PAGECACHE_TAG_WRITEBACK);
2723 if (bdi_cap_account_writeback(bdi)) { 2714 if (bdi_cap_account_writeback(bdi)) {
2724 struct bdi_writeback *wb = inode_to_wb(inode); 2715 struct bdi_writeback *wb = inode_to_wb(inode);
@@ -2758,11 +2749,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2758 2749
2759 lock_page_memcg(page); 2750 lock_page_memcg(page);
2760 if (mapping && mapping_use_writeback_tags(mapping)) { 2751 if (mapping && mapping_use_writeback_tags(mapping)) {
2752 XA_STATE(xas, &mapping->i_pages, page_index(page));
2761 struct inode *inode = mapping->host; 2753 struct inode *inode = mapping->host;
2762 struct backing_dev_info *bdi = inode_to_bdi(inode); 2754 struct backing_dev_info *bdi = inode_to_bdi(inode);
2763 unsigned long flags; 2755 unsigned long flags;
2764 2756
2765 xa_lock_irqsave(&mapping->i_pages, flags); 2757 xas_lock_irqsave(&xas, flags);
2758 xas_load(&xas);
2766 ret = TestSetPageWriteback(page); 2759 ret = TestSetPageWriteback(page);
2767 if (!ret) { 2760 if (!ret) {
2768 bool on_wblist; 2761 bool on_wblist;
@@ -2770,8 +2763,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2770 on_wblist = mapping_tagged(mapping, 2763 on_wblist = mapping_tagged(mapping,
2771 PAGECACHE_TAG_WRITEBACK); 2764 PAGECACHE_TAG_WRITEBACK);
2772 2765
2773 radix_tree_tag_set(&mapping->i_pages, page_index(page), 2766 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
2774 PAGECACHE_TAG_WRITEBACK);
2775 if (bdi_cap_account_writeback(bdi)) 2767 if (bdi_cap_account_writeback(bdi))
2776 inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); 2768 inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
2777 2769
@@ -2784,12 +2776,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2784 sb_mark_inode_writeback(mapping->host); 2776 sb_mark_inode_writeback(mapping->host);
2785 } 2777 }
2786 if (!PageDirty(page)) 2778 if (!PageDirty(page))
2787 radix_tree_tag_clear(&mapping->i_pages, page_index(page), 2779 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
2788 PAGECACHE_TAG_DIRTY);
2789 if (!keep_write) 2780 if (!keep_write)
2790 radix_tree_tag_clear(&mapping->i_pages, page_index(page), 2781 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
2791 PAGECACHE_TAG_TOWRITE); 2782 xas_unlock_irqrestore(&xas, flags);
2792 xa_unlock_irqrestore(&mapping->i_pages, flags);
2793 } else { 2783 } else {
2794 ret = TestSetPageWriteback(page); 2784 ret = TestSetPageWriteback(page);
2795 } 2785 }
@@ -2803,16 +2793,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2803} 2793}
2804EXPORT_SYMBOL(__test_set_page_writeback); 2794EXPORT_SYMBOL(__test_set_page_writeback);
2805 2795
2806/*
2807 * Return true if any of the pages in the mapping are marked with the
2808 * passed tag.
2809 */
2810int mapping_tagged(struct address_space *mapping, int tag)
2811{
2812 return radix_tree_tagged(&mapping->i_pages, tag);
2813}
2814EXPORT_SYMBOL(mapping_tagged);
2815
2816/** 2796/**
2817 * wait_for_stable_page() - wait for writeback to finish, if necessary. 2797 * wait_for_stable_page() - wait for writeback to finish, if necessary.
2818 * @page: The page to wait on. 2798 * @page: The page to wait on.
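
Two mark-manipulation styles appear in this file. When the caller holds the plain xa_lock (as in __set_page_dirty_nobuffers() and test_clear_page_writeback()), __xa_set_mark()/__xa_clear_mark() take an index directly. When several marks are touched for the same index (as in __test_set_page_writeback()), it is cheaper to open an XA_STATE, walk to the entry once with xas_load(), and then use xas_set_mark()/xas_clear_mark(). A rough sketch of the latter shape, with mark_index_writeback() as a hypothetical name:

#include <linux/pagemap.h>
#include <linux/xarray.h>

static void mark_index_writeback(struct address_space *mapping,
				 pgoff_t index, bool keep_write)
{
	XA_STATE(xas, &mapping->i_pages, index);
	unsigned long flags;

	xas_lock_irqsave(&xas, flags);
	xas_load(&xas);				/* walk to the entry once */
	xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
	xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
	if (!keep_write)
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irqrestore(&xas, flags);
}
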
diff --git a/mm/readahead.c b/mm/readahead.c
index 4e630143a0ba..f3d6f9656a3c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -176,10 +176,8 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
176 if (page_offset > end_index) 176 if (page_offset > end_index)
177 break; 177 break;
178 178
179 rcu_read_lock(); 179 page = xa_load(&mapping->i_pages, page_offset);
180 page = radix_tree_lookup(&mapping->i_pages, page_offset); 180 if (page && !xa_is_value(page)) {
181 rcu_read_unlock();
182 if (page && !radix_tree_exceptional_entry(page)) {
183 /* 181 /*
184 * Page already present? Kick off the current batch of 182 * Page already present? Kick off the current batch of
185 * contiguous pages before continuing with the next 183 * contiguous pages before continuing with the next
@@ -336,7 +334,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
336 pgoff_t head; 334 pgoff_t head;
337 335
338 rcu_read_lock(); 336 rcu_read_lock();
339 head = page_cache_prev_hole(mapping, offset - 1, max); 337 head = page_cache_prev_miss(mapping, offset - 1, max);
340 rcu_read_unlock(); 338 rcu_read_unlock();
341 339
342 return offset - 1 - head; 340 return offset - 1 - head;
@@ -425,7 +423,7 @@ ondemand_readahead(struct address_space *mapping,
425 pgoff_t start; 423 pgoff_t start;
426 424
427 rcu_read_lock(); 425 rcu_read_lock();
428 start = page_cache_next_hole(mapping, offset + 1, max_pages); 426 start = page_cache_next_miss(mapping, offset + 1, max_pages);
429 rcu_read_unlock(); 427 rcu_read_unlock();
430 428
431 if (!start || start - offset > max_pages) 429 if (!start || start - offset > max_pages)
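
In readahead the explicit rcu_read_lock()/radix_tree_lookup()/rcu_read_unlock() triple collapses into a single xa_load(), which handles RCU internally, and the hole-finding helpers are renamed to page_cache_next_miss()/page_cache_prev_miss(). A sketch of how the miss helper sizes the window, loosely following ondemand_readahead() above (first_gap_after() is a hypothetical name, and the exact return-value conventions of page_cache_next_miss() are simplified here):

#include <linux/pagemap.h>
#include <linux/rcupdate.h>

/*
 * Find the first index after @offset that is not yet cached, within
 * @max_pages.  Returns 0 when no useful gap is found (index wrap-around,
 * or the whole window is already populated).
 */
static pgoff_t first_gap_after(struct address_space *mapping,
			       pgoff_t offset, unsigned long max_pages)
{
	pgoff_t start;

	rcu_read_lock();
	start = page_cache_next_miss(mapping, offset + 1, max_pages);
	rcu_read_unlock();

	if (!start || start - offset > max_pages)
		return 0;
	return start;
}
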
diff --git a/mm/shmem.c b/mm/shmem.c
index 446942677cd4..56bf122e0bb4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -322,24 +322,20 @@ void shmem_uncharge(struct inode *inode, long pages)
322} 322}
323 323
324/* 324/*
325 * Replace item expected in radix tree by a new item, while holding tree lock. 325 * Replace item expected in xarray by a new item, while holding xa_lock.
326 */ 326 */
327static int shmem_radix_tree_replace(struct address_space *mapping, 327static int shmem_replace_entry(struct address_space *mapping,
328 pgoff_t index, void *expected, void *replacement) 328 pgoff_t index, void *expected, void *replacement)
329{ 329{
330 struct radix_tree_node *node; 330 XA_STATE(xas, &mapping->i_pages, index);
331 void __rcu **pslot;
332 void *item; 331 void *item;
333 332
334 VM_BUG_ON(!expected); 333 VM_BUG_ON(!expected);
335 VM_BUG_ON(!replacement); 334 VM_BUG_ON(!replacement);
336 item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); 335 item = xas_load(&xas);
337 if (!item)
338 return -ENOENT;
339 if (item != expected) 336 if (item != expected)
340 return -ENOENT; 337 return -ENOENT;
341 __radix_tree_replace(&mapping->i_pages, node, pslot, 338 xas_store(&xas, replacement);
342 replacement, NULL);
343 return 0; 339 return 0;
344} 340}
345 341
@@ -353,12 +349,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
353static bool shmem_confirm_swap(struct address_space *mapping, 349static bool shmem_confirm_swap(struct address_space *mapping,
354 pgoff_t index, swp_entry_t swap) 350 pgoff_t index, swp_entry_t swap)
355{ 351{
356 void *item; 352 return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
357
358 rcu_read_lock();
359 item = radix_tree_lookup(&mapping->i_pages, index);
360 rcu_read_unlock();
361 return item == swp_to_radix_entry(swap);
362} 353}
363 354
364/* 355/*
@@ -586,9 +577,11 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
586 */ 577 */
587static int shmem_add_to_page_cache(struct page *page, 578static int shmem_add_to_page_cache(struct page *page,
588 struct address_space *mapping, 579 struct address_space *mapping,
589 pgoff_t index, void *expected) 580 pgoff_t index, void *expected, gfp_t gfp)
590{ 581{
591 int error, nr = hpage_nr_pages(page); 582 XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
583 unsigned long i = 0;
584 unsigned long nr = 1UL << compound_order(page);
592 585
593 VM_BUG_ON_PAGE(PageTail(page), page); 586 VM_BUG_ON_PAGE(PageTail(page), page);
594 VM_BUG_ON_PAGE(index != round_down(index, nr), page); 587 VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -600,47 +593,39 @@ static int shmem_add_to_page_cache(struct page *page,
600 page->mapping = mapping; 593 page->mapping = mapping;
601 page->index = index; 594 page->index = index;
602 595
603 xa_lock_irq(&mapping->i_pages); 596 do {
604 if (PageTransHuge(page)) { 597 void *entry;
605 void __rcu **results; 598 xas_lock_irq(&xas);
606 pgoff_t idx; 599 entry = xas_find_conflict(&xas);
607 int i; 600 if (entry != expected)
608 601 xas_set_err(&xas, -EEXIST);
609 error = 0; 602 xas_create_range(&xas);
610 if (radix_tree_gang_lookup_slot(&mapping->i_pages, 603 if (xas_error(&xas))
611 &results, &idx, index, 1) && 604 goto unlock;
612 idx < index + HPAGE_PMD_NR) { 605next:
613 error = -EEXIST; 606 xas_store(&xas, page + i);
607 if (++i < nr) {
608 xas_next(&xas);
609 goto next;
614 } 610 }
615 611 if (PageTransHuge(page)) {
616 if (!error) {
617 for (i = 0; i < HPAGE_PMD_NR; i++) {
618 error = radix_tree_insert(&mapping->i_pages,
619 index + i, page + i);
620 VM_BUG_ON(error);
621 }
622 count_vm_event(THP_FILE_ALLOC); 612 count_vm_event(THP_FILE_ALLOC);
613 __inc_node_page_state(page, NR_SHMEM_THPS);
623 } 614 }
624 } else if (!expected) {
625 error = radix_tree_insert(&mapping->i_pages, index, page);
626 } else {
627 error = shmem_radix_tree_replace(mapping, index, expected,
628 page);
629 }
630
631 if (!error) {
632 mapping->nrpages += nr; 615 mapping->nrpages += nr;
633 if (PageTransHuge(page))
634 __inc_node_page_state(page, NR_SHMEM_THPS);
635 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); 616 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
636 __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); 617 __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
637 xa_unlock_irq(&mapping->i_pages); 618unlock:
638 } else { 619 xas_unlock_irq(&xas);
620 } while (xas_nomem(&xas, gfp));
621
622 if (xas_error(&xas)) {
639 page->mapping = NULL; 623 page->mapping = NULL;
640 xa_unlock_irq(&mapping->i_pages);
641 page_ref_sub(page, nr); 624 page_ref_sub(page, nr);
625 return xas_error(&xas);
642 } 626 }
643 return error; 627
628 return 0;
644} 629}
645 630
646/* 631/*
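
The new shmem_add_to_page_cache() is the clearest example of how the XArray removes preloading: the store is attempted under the lock, and if the array needs a node, xas_nomem() allocates it with the caller's GFP flags outside the lock and the loop retries. The huge-page details (XA_STATE_ORDER(), xas_find_conflict(), xas_create_range()) are omitted from this sketch; store_entry() is a hypothetical name, not part of the patch:

#include <linux/gfp.h>
#include <linux/xarray.h>

static int store_entry(struct xarray *xa, unsigned long index,
		       void *entry, gfp_t gfp)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		xas_store(&xas, entry);	/* may record -ENOMEM in the state */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));	/* allocate outside the lock, retry */

	return xas_error(&xas);
}

This is also why shmem_add_to_page_cache() grows a gfp_t argument: allocation context now travels with the store instead of being fixed at radix_tree_maybe_preload() time.
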
@@ -654,7 +639,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
654 VM_BUG_ON_PAGE(PageCompound(page), page); 639 VM_BUG_ON_PAGE(PageCompound(page), page);
655 640
656 xa_lock_irq(&mapping->i_pages); 641 xa_lock_irq(&mapping->i_pages);
657 error = shmem_radix_tree_replace(mapping, page->index, page, radswap); 642 error = shmem_replace_entry(mapping, page->index, page, radswap);
658 page->mapping = NULL; 643 page->mapping = NULL;
659 mapping->nrpages--; 644 mapping->nrpages--;
660 __dec_node_page_state(page, NR_FILE_PAGES); 645 __dec_node_page_state(page, NR_FILE_PAGES);
@@ -665,7 +650,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
665} 650}
666 651
667/* 652/*
668 * Remove swap entry from radix tree, free the swap and its page cache. 653 * Remove swap entry from page cache, free the swap and its page cache.
669 */ 654 */
670static int shmem_free_swap(struct address_space *mapping, 655static int shmem_free_swap(struct address_space *mapping,
671 pgoff_t index, void *radswap) 656 pgoff_t index, void *radswap)
@@ -673,7 +658,7 @@ static int shmem_free_swap(struct address_space *mapping,
673 void *old; 658 void *old;
674 659
675 xa_lock_irq(&mapping->i_pages); 660 xa_lock_irq(&mapping->i_pages);
676 old = radix_tree_delete_item(&mapping->i_pages, index, radswap); 661 old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
677 xa_unlock_irq(&mapping->i_pages); 662 xa_unlock_irq(&mapping->i_pages);
678 if (old != radswap) 663 if (old != radswap)
679 return -ENOENT; 664 return -ENOENT;
@@ -691,29 +676,19 @@ static int shmem_free_swap(struct address_space *mapping,
691unsigned long shmem_partial_swap_usage(struct address_space *mapping, 676unsigned long shmem_partial_swap_usage(struct address_space *mapping,
692 pgoff_t start, pgoff_t end) 677 pgoff_t start, pgoff_t end)
693{ 678{
694 struct radix_tree_iter iter; 679 XA_STATE(xas, &mapping->i_pages, start);
695 void __rcu **slot;
696 struct page *page; 680 struct page *page;
697 unsigned long swapped = 0; 681 unsigned long swapped = 0;
698 682
699 rcu_read_lock(); 683 rcu_read_lock();
700 684 xas_for_each(&xas, page, end - 1) {
701 radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { 685 if (xas_retry(&xas, page))
702 if (iter.index >= end)
703 break;
704
705 page = radix_tree_deref_slot(slot);
706
707 if (radix_tree_deref_retry(page)) {
708 slot = radix_tree_iter_retry(&iter);
709 continue; 686 continue;
710 } 687 if (xa_is_value(page))
711
712 if (radix_tree_exceptional_entry(page))
713 swapped++; 688 swapped++;
714 689
715 if (need_resched()) { 690 if (need_resched()) {
716 slot = radix_tree_iter_resume(slot, &iter); 691 xas_pause(&xas);
717 cond_resched_rcu(); 692 cond_resched_rcu();
718 } 693 }
719 } 694 }
@@ -788,7 +763,7 @@ void shmem_unlock_mapping(struct address_space *mapping)
788} 763}
789 764
790/* 765/*
791 * Remove range of pages and swap entries from radix tree, and free them. 766 * Remove range of pages and swap entries from page cache, and free them.
792 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 767 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
793 */ 768 */
794static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 769static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
@@ -824,7 +799,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
824 if (index >= end) 799 if (index >= end)
825 break; 800 break;
826 801
827 if (radix_tree_exceptional_entry(page)) { 802 if (xa_is_value(page)) {
828 if (unfalloc) 803 if (unfalloc)
829 continue; 804 continue;
830 nr_swaps_freed += !shmem_free_swap(mapping, 805 nr_swaps_freed += !shmem_free_swap(mapping,
@@ -921,7 +896,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
921 if (index >= end) 896 if (index >= end)
922 break; 897 break;
923 898
924 if (radix_tree_exceptional_entry(page)) { 899 if (xa_is_value(page)) {
925 if (unfalloc) 900 if (unfalloc)
926 continue; 901 continue;
927 if (shmem_free_swap(mapping, index, page)) { 902 if (shmem_free_swap(mapping, index, page)) {
@@ -1110,34 +1085,27 @@ static void shmem_evict_inode(struct inode *inode)
1110 clear_inode(inode); 1085 clear_inode(inode);
1111} 1086}
1112 1087
1113static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) 1088static unsigned long find_swap_entry(struct xarray *xa, void *item)
1114{ 1089{
1115 struct radix_tree_iter iter; 1090 XA_STATE(xas, xa, 0);
1116 void __rcu **slot;
1117 unsigned long found = -1;
1118 unsigned int checked = 0; 1091 unsigned int checked = 0;
1092 void *entry;
1119 1093
1120 rcu_read_lock(); 1094 rcu_read_lock();
1121 radix_tree_for_each_slot(slot, root, &iter, 0) { 1095 xas_for_each(&xas, entry, ULONG_MAX) {
1122 void *entry = radix_tree_deref_slot(slot); 1096 if (xas_retry(&xas, entry))
1123
1124 if (radix_tree_deref_retry(entry)) {
1125 slot = radix_tree_iter_retry(&iter);
1126 continue; 1097 continue;
1127 } 1098 if (entry == item)
1128 if (entry == item) {
1129 found = iter.index;
1130 break; 1099 break;
1131 }
1132 checked++; 1100 checked++;
1133 if ((checked % 4096) != 0) 1101 if ((checked % XA_CHECK_SCHED) != 0)
1134 continue; 1102 continue;
1135 slot = radix_tree_iter_resume(slot, &iter); 1103 xas_pause(&xas);
1136 cond_resched_rcu(); 1104 cond_resched_rcu();
1137 } 1105 }
1138
1139 rcu_read_unlock(); 1106 rcu_read_unlock();
1140 return found; 1107
1108 return entry ? xas.xa_index : -1;
1141} 1109}
1142 1110
1143/* 1111/*
@@ -1175,10 +1143,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
1175 * We needed to drop mutex to make that restrictive page 1143 * We needed to drop mutex to make that restrictive page
1176 * allocation, but the inode might have been freed while we 1144 * allocation, but the inode might have been freed while we
1177 * dropped it: although a racing shmem_evict_inode() cannot 1145 * dropped it: although a racing shmem_evict_inode() cannot
1178 * complete without emptying the radix_tree, our page lock 1146 * complete without emptying the page cache, our page lock
1179 * on this swapcache page is not enough to prevent that - 1147 * on this swapcache page is not enough to prevent that -
1180 * free_swap_and_cache() of our swap entry will only 1148 * free_swap_and_cache() of our swap entry will only
1181 * trylock_page(), removing swap from radix_tree whatever. 1149 * trylock_page(), removing swap from page cache whatever.
1182 * 1150 *
1183 * We must not proceed to shmem_add_to_page_cache() if the 1151 * We must not proceed to shmem_add_to_page_cache() if the
1184 * inode has been freed, but of course we cannot rely on 1152 * inode has been freed, but of course we cannot rely on
@@ -1200,7 +1168,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
1200 */ 1168 */
1201 if (!error) 1169 if (!error)
1202 error = shmem_add_to_page_cache(*pagep, mapping, index, 1170 error = shmem_add_to_page_cache(*pagep, mapping, index,
1203 radswap); 1171 radswap, gfp);
1204 if (error != -ENOMEM) { 1172 if (error != -ENOMEM) {
1205 /* 1173 /*
1206 * Truncation and eviction use free_swap_and_cache(), which 1174 * Truncation and eviction use free_swap_and_cache(), which
@@ -1244,7 +1212,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
1244 &memcg, false); 1212 &memcg, false);
1245 if (error) 1213 if (error)
1246 goto out; 1214 goto out;
1247 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 1215 /* No memory allocation: swap entry occupies the slot for the page */
1248 error = -EAGAIN; 1216 error = -EAGAIN;
1249 1217
1250 mutex_lock(&shmem_swaplist_mutex); 1218 mutex_lock(&shmem_swaplist_mutex);
@@ -1453,23 +1421,17 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
1453 struct shmem_inode_info *info, pgoff_t index) 1421 struct shmem_inode_info *info, pgoff_t index)
1454{ 1422{
1455 struct vm_area_struct pvma; 1423 struct vm_area_struct pvma;
1456 struct inode *inode = &info->vfs_inode; 1424 struct address_space *mapping = info->vfs_inode.i_mapping;
1457 struct address_space *mapping = inode->i_mapping; 1425 pgoff_t hindex;
1458 pgoff_t idx, hindex;
1459 void __rcu **results;
1460 struct page *page; 1426 struct page *page;
1461 1427
1462 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 1428 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1463 return NULL; 1429 return NULL;
1464 1430
1465 hindex = round_down(index, HPAGE_PMD_NR); 1431 hindex = round_down(index, HPAGE_PMD_NR);
1466 rcu_read_lock(); 1432 if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1467 if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, 1433 XA_PRESENT))
1468 hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
1469 rcu_read_unlock();
1470 return NULL; 1434 return NULL;
1471 }
1472 rcu_read_unlock();
1473 1435
1474 shmem_pseudo_vma_init(&pvma, info, hindex); 1436 shmem_pseudo_vma_init(&pvma, info, hindex);
1475 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, 1437 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
@@ -1578,8 +1540,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1578 * a nice clean interface for us to replace oldpage by newpage there. 1540 * a nice clean interface for us to replace oldpage by newpage there.
1579 */ 1541 */
1580 xa_lock_irq(&swap_mapping->i_pages); 1542 xa_lock_irq(&swap_mapping->i_pages);
1581 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1543 error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
1582 newpage);
1583 if (!error) { 1544 if (!error) {
1584 __inc_node_page_state(newpage, NR_FILE_PAGES); 1545 __inc_node_page_state(newpage, NR_FILE_PAGES);
1585 __dec_node_page_state(oldpage, NR_FILE_PAGES); 1546 __dec_node_page_state(oldpage, NR_FILE_PAGES);
@@ -1643,7 +1604,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1643repeat: 1604repeat:
1644 swap.val = 0; 1605 swap.val = 0;
1645 page = find_lock_entry(mapping, index); 1606 page = find_lock_entry(mapping, index);
1646 if (radix_tree_exceptional_entry(page)) { 1607 if (xa_is_value(page)) {
1647 swap = radix_to_swp_entry(page); 1608 swap = radix_to_swp_entry(page);
1648 page = NULL; 1609 page = NULL;
1649 } 1610 }
@@ -1718,7 +1679,7 @@ repeat:
1718 false); 1679 false);
1719 if (!error) { 1680 if (!error) {
1720 error = shmem_add_to_page_cache(page, mapping, index, 1681 error = shmem_add_to_page_cache(page, mapping, index,
1721 swp_to_radix_entry(swap)); 1682 swp_to_radix_entry(swap), gfp);
1722 /* 1683 /*
1723 * We already confirmed swap under page lock, and make 1684 * We already confirmed swap under page lock, and make
1724 * no memory allocation here, so usually no possibility 1685 * no memory allocation here, so usually no possibility
@@ -1824,13 +1785,8 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
1824 PageTransHuge(page)); 1785 PageTransHuge(page));
1825 if (error) 1786 if (error)
1826 goto unacct; 1787 goto unacct;
1827 error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, 1788 error = shmem_add_to_page_cache(page, mapping, hindex,
1828 compound_order(page)); 1789 NULL, gfp & GFP_RECLAIM_MASK);
1829 if (!error) {
1830 error = shmem_add_to_page_cache(page, mapping, hindex,
1831 NULL);
1832 radix_tree_preload_end();
1833 }
1834 if (error) { 1790 if (error) {
1835 mem_cgroup_cancel_charge(page, memcg, 1791 mem_cgroup_cancel_charge(page, memcg,
1836 PageTransHuge(page)); 1792 PageTransHuge(page));
@@ -1931,7 +1887,7 @@ unlock:
1931 spin_unlock_irq(&info->lock); 1887 spin_unlock_irq(&info->lock);
1932 goto repeat; 1888 goto repeat;
1933 } 1889 }
1934 if (error == -EEXIST) /* from above or from radix_tree_insert */ 1890 if (error == -EEXIST)
1935 goto repeat; 1891 goto repeat;
1936 return error; 1892 return error;
1937} 1893}
@@ -2299,11 +2255,8 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2299 if (ret) 2255 if (ret)
2300 goto out_release; 2256 goto out_release;
2301 2257
2302 ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 2258 ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
2303 if (!ret) { 2259 gfp & GFP_RECLAIM_MASK);
2304 ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
2305 radix_tree_preload_end();
2306 }
2307 if (ret) 2260 if (ret)
2308 goto out_release_uncharge; 2261 goto out_release_uncharge;
2309 2262
@@ -2548,7 +2501,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
2548} 2501}
2549 2502
2550/* 2503/*
2551 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 2504 * llseek SEEK_DATA or SEEK_HOLE through the page cache.
2552 */ 2505 */
2553static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 2506static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
2554 pgoff_t index, pgoff_t end, int whence) 2507 pgoff_t index, pgoff_t end, int whence)
@@ -2578,7 +2531,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
2578 index = indices[i]; 2531 index = indices[i];
2579 } 2532 }
2580 page = pvec.pages[i]; 2533 page = pvec.pages[i];
2581 if (page && !radix_tree_exceptional_entry(page)) { 2534 if (page && !xa_is_value(page)) {
2582 if (!PageUptodate(page)) 2535 if (!PageUptodate(page))
2583 page = NULL; 2536 page = NULL;
2584 } 2537 }
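
Elsewhere in shmem the single-entry operations map onto the plain xa_* API: shmem_confirm_swap() becomes one xa_load() comparison, shmem_alloc_hugepage() probes a range with xa_find(..., XA_PRESENT), and shmem_free_swap() uses __xa_cmpxchg() to remove an entry only if it still matches. A sketch of that last conversion (erase_if_matches() is a hypothetical name; GFP can be 0 because storing NULL never allocates):

#include <linux/errno.h>
#include <linux/xarray.h>

static int erase_if_matches(struct xarray *xa, unsigned long index,
			    void *expected)
{
	void *old;

	xa_lock_irq(xa);
	old = __xa_cmpxchg(xa, index, expected, NULL, 0);
	xa_unlock_irq(xa);

	return old == expected ? 0 : -ENOENT;
}
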
diff --git a/mm/swap.c b/mm/swap.c
index 87a54c8dee34..aa483719922e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -964,7 +964,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
964 964
965 for (i = 0, j = 0; i < pagevec_count(pvec); i++) { 965 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
966 struct page *page = pvec->pages[i]; 966 struct page *page = pvec->pages[i];
967 if (!radix_tree_exceptional_entry(page)) 967 if (!xa_is_value(page))
968 pvec->pages[j++] = page; 968 pvec->pages[j++] = page;
969 } 969 }
970 pvec->nr = j; 970 pvec->nr = j;
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(pagevec_lookup_range);
1001 1001
1002unsigned pagevec_lookup_range_tag(struct pagevec *pvec, 1002unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
1003 struct address_space *mapping, pgoff_t *index, pgoff_t end, 1003 struct address_space *mapping, pgoff_t *index, pgoff_t end,
1004 int tag) 1004 xa_mark_t tag)
1005{ 1005{
1006 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, 1006 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1007 PAGEVEC_SIZE, pvec->pages); 1007 PAGEVEC_SIZE, pvec->pages);
@@ -1011,7 +1011,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag);
1011 1011
1012unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, 1012unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
1013 struct address_space *mapping, pgoff_t *index, pgoff_t end, 1013 struct address_space *mapping, pgoff_t *index, pgoff_t end,
1014 int tag, unsigned max_pages) 1014 xa_mark_t tag, unsigned max_pages)
1015{ 1015{
1016 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, 1016 pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1017 min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); 1017 min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0d6a7f268d2e..fd2f21e1c60a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -107,14 +107,15 @@ void show_swap_cache_info(void)
107} 107}
108 108
109/* 109/*
110 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 110 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
111 * but sets SwapCache flag and private instead of mapping and index. 111 * but sets SwapCache flag and private instead of mapping and index.
112 */ 112 */
113int __add_to_swap_cache(struct page *page, swp_entry_t entry) 113int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
114{ 114{
115 int error, i, nr = hpage_nr_pages(page); 115 struct address_space *address_space = swap_address_space(entry);
116 struct address_space *address_space;
117 pgoff_t idx = swp_offset(entry); 116 pgoff_t idx = swp_offset(entry);
117 XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
118 unsigned long i, nr = 1UL << compound_order(page);
118 119
119 VM_BUG_ON_PAGE(!PageLocked(page), page); 120 VM_BUG_ON_PAGE(!PageLocked(page), page);
120 VM_BUG_ON_PAGE(PageSwapCache(page), page); 121 VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -123,73 +124,52 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
123 page_ref_add(page, nr); 124 page_ref_add(page, nr);
124 SetPageSwapCache(page); 125 SetPageSwapCache(page);
125 126
126 address_space = swap_address_space(entry); 127 do {
127 xa_lock_irq(&address_space->i_pages); 128 xas_lock_irq(&xas);
128 for (i = 0; i < nr; i++) { 129 xas_create_range(&xas);
129 set_page_private(page + i, entry.val + i); 130 if (xas_error(&xas))
130 error = radix_tree_insert(&address_space->i_pages, 131 goto unlock;
131 idx + i, page + i); 132 for (i = 0; i < nr; i++) {
132 if (unlikely(error)) 133 VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
133 break; 134 set_page_private(page + i, entry.val + i);
134 } 135 xas_store(&xas, page + i);
135 if (likely(!error)) { 136 xas_next(&xas);
137 }
136 address_space->nrpages += nr; 138 address_space->nrpages += nr;
137 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); 139 __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
138 ADD_CACHE_INFO(add_total, nr); 140 ADD_CACHE_INFO(add_total, nr);
139 } else { 141unlock:
140 /* 142 xas_unlock_irq(&xas);
141 * Only the context which have set SWAP_HAS_CACHE flag 143 } while (xas_nomem(&xas, gfp));
142 * would call add_to_swap_cache().
143 * So add_to_swap_cache() doesn't returns -EEXIST.
144 */
145 VM_BUG_ON(error == -EEXIST);
146 set_page_private(page + i, 0UL);
147 while (i--) {
148 radix_tree_delete(&address_space->i_pages, idx + i);
149 set_page_private(page + i, 0UL);
150 }
151 ClearPageSwapCache(page);
152 page_ref_sub(page, nr);
153 }
154 xa_unlock_irq(&address_space->i_pages);
155 144
156 return error; 145 if (!xas_error(&xas))
157} 146 return 0;
158
159
160int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
161{
162 int error;
163 147
164 error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); 148 ClearPageSwapCache(page);
165 if (!error) { 149 page_ref_sub(page, nr);
166 error = __add_to_swap_cache(page, entry); 150 return xas_error(&xas);
167 radix_tree_preload_end();
168 }
169 return error;
170} 151}
171 152
172/* 153/*
173 * This must be called only on pages that have 154 * This must be called only on pages that have
174 * been verified to be in the swap cache. 155 * been verified to be in the swap cache.
175 */ 156 */
176void __delete_from_swap_cache(struct page *page) 157void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
177{ 158{
178 struct address_space *address_space; 159 struct address_space *address_space = swap_address_space(entry);
179 int i, nr = hpage_nr_pages(page); 160 int i, nr = hpage_nr_pages(page);
180 swp_entry_t entry; 161 pgoff_t idx = swp_offset(entry);
181 pgoff_t idx; 162 XA_STATE(xas, &address_space->i_pages, idx);
182 163
183 VM_BUG_ON_PAGE(!PageLocked(page), page); 164 VM_BUG_ON_PAGE(!PageLocked(page), page);
184 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 165 VM_BUG_ON_PAGE(!PageSwapCache(page), page);
185 VM_BUG_ON_PAGE(PageWriteback(page), page); 166 VM_BUG_ON_PAGE(PageWriteback(page), page);
186 167
187 entry.val = page_private(page);
188 address_space = swap_address_space(entry);
189 idx = swp_offset(entry);
190 for (i = 0; i < nr; i++) { 168 for (i = 0; i < nr; i++) {
191 radix_tree_delete(&address_space->i_pages, idx + i); 169 void *entry = xas_store(&xas, NULL);
170 VM_BUG_ON_PAGE(entry != page + i, entry);
192 set_page_private(page + i, 0); 171 set_page_private(page + i, 0);
172 xas_next(&xas);
193 } 173 }
194 ClearPageSwapCache(page); 174 ClearPageSwapCache(page);
195 address_space->nrpages -= nr; 175 address_space->nrpages -= nr;
@@ -217,7 +197,7 @@ int add_to_swap(struct page *page)
217 return 0; 197 return 0;
218 198
219 /* 199 /*
220 * Radix-tree node allocations from PF_MEMALLOC contexts could 200 * XArray node allocations from PF_MEMALLOC contexts could
221 * completely exhaust the page allocator. __GFP_NOMEMALLOC 201 * completely exhaust the page allocator. __GFP_NOMEMALLOC
222 * stops emergency reserves from being allocated. 202 * stops emergency reserves from being allocated.
223 * 203 *
@@ -229,7 +209,6 @@ int add_to_swap(struct page *page)
229 */ 209 */
230 err = add_to_swap_cache(page, entry, 210 err = add_to_swap_cache(page, entry,
231 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 211 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
232 /* -ENOMEM radix-tree allocation failure */
233 if (err) 212 if (err)
234 /* 213 /*
235 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 214 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -263,14 +242,11 @@ fail:
263 */ 242 */
264void delete_from_swap_cache(struct page *page) 243void delete_from_swap_cache(struct page *page)
265{ 244{
266 swp_entry_t entry; 245 swp_entry_t entry = { .val = page_private(page) };
267 struct address_space *address_space; 246 struct address_space *address_space = swap_address_space(entry);
268 247
269 entry.val = page_private(page);
270
271 address_space = swap_address_space(entry);
272 xa_lock_irq(&address_space->i_pages); 248 xa_lock_irq(&address_space->i_pages);
273 __delete_from_swap_cache(page); 249 __delete_from_swap_cache(page, entry);
274 xa_unlock_irq(&address_space->i_pages); 250 xa_unlock_irq(&address_space->i_pages);
275 251
276 put_swap_page(page, entry); 252 put_swap_page(page, entry);
@@ -414,18 +390,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
414 } 390 }
415 391
416 /* 392 /*
417 * call radix_tree_preload() while we can wait.
418 */
419 err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
420 if (err)
421 break;
422
423 /*
424 * Swap entry may have been freed since our caller observed it. 393 * Swap entry may have been freed since our caller observed it.
425 */ 394 */
426 err = swapcache_prepare(entry); 395 err = swapcache_prepare(entry);
427 if (err == -EEXIST) { 396 if (err == -EEXIST) {
428 radix_tree_preload_end();
429 /* 397 /*
430 * We might race against get_swap_page() and stumble 398 * We might race against get_swap_page() and stumble
431 * across a SWAP_HAS_CACHE swap_map entry whose page 399 * across a SWAP_HAS_CACHE swap_map entry whose page
@@ -433,27 +401,20 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
433 */ 401 */
434 cond_resched(); 402 cond_resched();
435 continue; 403 continue;
436 } 404 } else if (err) /* swp entry is obsolete ? */
437 if (err) { /* swp entry is obsolete ? */
438 radix_tree_preload_end();
439 break; 405 break;
440 }
441 406
442 /* May fail (-ENOMEM) if radix-tree node allocation failed. */ 407 /* May fail (-ENOMEM) if XArray node allocation failed. */
443 __SetPageLocked(new_page); 408 __SetPageLocked(new_page);
444 __SetPageSwapBacked(new_page); 409 __SetPageSwapBacked(new_page);
445 err = __add_to_swap_cache(new_page, entry); 410 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
446 if (likely(!err)) { 411 if (likely(!err)) {
447 radix_tree_preload_end(); 412 /* Initiate read into locked page */
448 /*
449 * Initiate read into locked page and return.
450 */
451 SetPageWorkingset(new_page); 413 SetPageWorkingset(new_page);
452 lru_cache_add_anon(new_page); 414 lru_cache_add_anon(new_page);
453 *new_page_allocated = true; 415 *new_page_allocated = true;
454 return new_page; 416 return new_page;
455 } 417 }
456 radix_tree_preload_end();
457 __ClearPageLocked(new_page); 418 __ClearPageLocked(new_page);
458 /* 419 /*
459 * add_to_swap_cache() doesn't return -EEXIST, so we can safely 420 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -626,7 +587,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
626 return -ENOMEM; 587 return -ENOMEM;
627 for (i = 0; i < nr; i++) { 588 for (i = 0; i < nr; i++) {
628 space = spaces + i; 589 space = spaces + i;
629 INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); 590 xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
630 atomic_set(&space->i_mmap_writable, 0); 591 atomic_set(&space->i_mmap_writable, 0);
631 space->a_ops = &swap_aops; 592 space->a_ops = &swap_aops;
632 /* swap cache doesn't use writeback related tags */ 593 /* swap cache doesn't use writeback related tags */
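
Swap-cache setup and teardown follow the same lines: xa_init_flags(..., XA_FLAGS_LOCK_IRQ) replaces INIT_RADIX_TREE() with a fixed GFP mask, insertion uses the xas_nomem() retry loop shown for shmem, and __delete_from_swap_cache() clears the nr consecutive slots of a (possibly huge) page under a lock the caller already holds. A sketch of the teardown half, with delete_pages_locked() as a hypothetical name:

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

/* Caller holds xa_lock_irq(&mapping->i_pages), as __remove_mapping() does. */
static void delete_pages_locked(struct address_space *mapping,
				struct page *page, pgoff_t first,
				unsigned long nr)
{
	XA_STATE(xas, &mapping->i_pages, first);
	unsigned long i;

	for (i = 0; i < nr; i++) {
		void *old = xas_store(&xas, NULL);	/* NULL erases; no alloc */

		VM_BUG_ON_PAGE(old != page + i, page);
		xas_next(&xas);
	}
	mapping->nrpages -= nr;
}
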
diff --git a/mm/truncate.c b/mm/truncate.c
index 1d2fb2dca96f..45d68e90b703 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -33,15 +33,12 @@
33static inline void __clear_shadow_entry(struct address_space *mapping, 33static inline void __clear_shadow_entry(struct address_space *mapping,
34 pgoff_t index, void *entry) 34 pgoff_t index, void *entry)
35{ 35{
36 struct radix_tree_node *node; 36 XA_STATE(xas, &mapping->i_pages, index);
37 void **slot;
38 37
39 if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) 38 xas_set_update(&xas, workingset_update_node);
39 if (xas_load(&xas) != entry)
40 return; 40 return;
41 if (*slot != entry) 41 xas_store(&xas, NULL);
42 return;
43 __radix_tree_replace(&mapping->i_pages, node, slot, NULL,
44 workingset_update_node);
45 mapping->nrexceptional--; 42 mapping->nrexceptional--;
46} 43}
47 44
@@ -70,7 +67,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
70 return; 67 return;
71 68
72 for (j = 0; j < pagevec_count(pvec); j++) 69 for (j = 0; j < pagevec_count(pvec); j++)
73 if (radix_tree_exceptional_entry(pvec->pages[j])) 70 if (xa_is_value(pvec->pages[j]))
74 break; 71 break;
75 72
76 if (j == pagevec_count(pvec)) 73 if (j == pagevec_count(pvec))
@@ -85,7 +82,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
85 struct page *page = pvec->pages[i]; 82 struct page *page = pvec->pages[i];
86 pgoff_t index = indices[i]; 83 pgoff_t index = indices[i];
87 84
88 if (!radix_tree_exceptional_entry(page)) { 85 if (!xa_is_value(page)) {
89 pvec->pages[j++] = page; 86 pvec->pages[j++] = page;
90 continue; 87 continue;
91 } 88 }
@@ -347,7 +344,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
347 if (index >= end) 344 if (index >= end)
348 break; 345 break;
349 346
350 if (radix_tree_exceptional_entry(page)) 347 if (xa_is_value(page))
351 continue; 348 continue;
352 349
353 if (!trylock_page(page)) 350 if (!trylock_page(page))
@@ -442,7 +439,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
442 break; 439 break;
443 } 440 }
444 441
445 if (radix_tree_exceptional_entry(page)) 442 if (xa_is_value(page))
446 continue; 443 continue;
447 444
448 lock_page(page); 445 lock_page(page);
@@ -561,7 +558,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
561 if (index > end) 558 if (index > end)
562 break; 559 break;
563 560
564 if (radix_tree_exceptional_entry(page)) { 561 if (xa_is_value(page)) {
565 invalidate_exceptional_entry(mapping, index, 562 invalidate_exceptional_entry(mapping, index,
566 page); 563 page);
567 continue; 564 continue;
@@ -692,7 +689,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
692 if (index > end) 689 if (index > end)
693 break; 690 break;
694 691
695 if (radix_tree_exceptional_entry(page)) { 692 if (xa_is_value(page)) {
696 if (!invalidate_exceptional_entry2(mapping, 693 if (!invalidate_exceptional_entry2(mapping,
697 index, page)) 694 index, page))
698 ret = -EBUSY; 695 ret = -EBUSY;
@@ -738,10 +735,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
738 index++; 735 index++;
739 } 736 }
740 /* 737 /*
741 * For DAX we invalidate page tables after invalidating radix tree. We 738 * For DAX we invalidate page tables after invalidating page cache. We
742 * could invalidate page tables while invalidating each entry however 739 * could invalidate page tables while invalidating each entry however
743 * that would be expensive. And doing range unmapping before doesn't 740 * that would be expensive. And doing range unmapping before doesn't
744 * work as we have no cheap way to find whether radix tree entry didn't 741 * work as we have no cheap way to find whether page cache entry didn't
745 * get remapped later. 742 * get remapped later.
746 */ 743 */
747 if (dax_mapping(mapping)) { 744 if (dax_mapping(mapping)) {
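
The truncate conversion shows xas_set_update(): the node-update callback that used to be passed to __radix_tree_replace() is now attached to the XA_STATE, so workingset_update_node() still runs when a store empties or shrinks a node. A sketch of __clear_shadow_entry() in isolation (clear_entry_if_present() is a hypothetical name; the caller is assumed to hold the i_pages lock, as clear_shadow_entry() does above):

#include <linux/swap.h>		/* workingset_update_node() */
#include <linux/xarray.h>

static void clear_entry_if_present(struct address_space *mapping,
				   pgoff_t index, void *entry)
{
	XA_STATE(xas, &mapping->i_pages, index);

	xas_set_update(&xas, workingset_update_node);
	if (xas_load(&xas) != entry)
		return;			/* already replaced or gone */
	xas_store(&xas, NULL);		/* erasing never needs memory */
}
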
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 28c9ae5633b9..62ac0c488624 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -751,12 +751,12 @@ static inline int is_page_cache_freeable(struct page *page)
751{ 751{
752 /* 752 /*
753 * A freeable page cache page is referenced only by the caller 753 * A freeable page cache page is referenced only by the caller
754 * that isolated the page, the page cache radix tree and 754 * that isolated the page, the page cache and optional buffer
755 * optional buffer heads at page->private. 755 * heads at page->private.
756 */ 756 */
757 int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? 757 int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
758 HPAGE_PMD_NR : 1; 758 HPAGE_PMD_NR : 1;
759 return page_count(page) - page_has_private(page) == 1 + radix_pins; 759 return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
760} 760}
761 761
762static int may_write_to_inode(struct inode *inode, struct scan_control *sc) 762static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -932,7 +932,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
932 if (PageSwapCache(page)) { 932 if (PageSwapCache(page)) {
933 swp_entry_t swap = { .val = page_private(page) }; 933 swp_entry_t swap = { .val = page_private(page) };
934 mem_cgroup_swapout(page, swap); 934 mem_cgroup_swapout(page, swap);
935 __delete_from_swap_cache(page); 935 __delete_from_swap_cache(page, swap);
936 xa_unlock_irqrestore(&mapping->i_pages, flags); 936 xa_unlock_irqrestore(&mapping->i_pages, flags);
937 put_swap_page(page, swap); 937 put_swap_page(page, swap);
938 } else { 938 } else {
diff --git a/mm/workingset.c b/mm/workingset.c
index cbc13d4dfa79..d46f8c92aa2f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -160,20 +160,20 @@
160 * and activations is maintained (node->inactive_age). 160 * and activations is maintained (node->inactive_age).
161 * 161 *
162 * On eviction, a snapshot of this counter (along with some bits to 162 * On eviction, a snapshot of this counter (along with some bits to
163 * identify the node) is stored in the now empty page cache radix tree 163 * identify the node) is stored in the now empty page cache
164 * slot of the evicted page. This is called a shadow entry. 164 * slot of the evicted page. This is called a shadow entry.
165 * 165 *
166 * On cache misses for which there are shadow entries, an eligible 166 * On cache misses for which there are shadow entries, an eligible
167 * refault distance will immediately activate the refaulting page. 167 * refault distance will immediately activate the refaulting page.
168 */ 168 */
169 169
170#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ 170#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
171 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) 171 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
172#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) 172#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
173 173
174/* 174/*
175 * Eviction timestamps need to be able to cover the full range of 175 * Eviction timestamps need to be able to cover the full range of
176 * actionable refaults. However, bits are tight in the radix tree 176 * actionable refaults. However, bits are tight in the xarray
177 * entry, and after storing the identifier for the lruvec there might 177 * entry, and after storing the identifier for the lruvec there might
178 * not be enough left to represent every single actionable refault. In 178 * not be enough left to represent every single actionable refault. In
179 * that case, we have to sacrifice granularity for distance, and group 179 * that case, we have to sacrifice granularity for distance, and group
@@ -185,22 +185,21 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
185 bool workingset) 185 bool workingset)
186{ 186{
187 eviction >>= bucket_order; 187 eviction >>= bucket_order;
188 eviction &= EVICTION_MASK;
188 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; 189 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
189 eviction = (eviction << NODES_SHIFT) | pgdat->node_id; 190 eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
190 eviction = (eviction << 1) | workingset; 191 eviction = (eviction << 1) | workingset;
191 eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
192 192
193 return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); 193 return xa_mk_value(eviction);
194} 194}
195 195
196static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, 196static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
197 unsigned long *evictionp, bool *workingsetp) 197 unsigned long *evictionp, bool *workingsetp)
198{ 198{
199 unsigned long entry = (unsigned long)shadow; 199 unsigned long entry = xa_to_value(shadow);
200 int memcgid, nid; 200 int memcgid, nid;
201 bool workingset; 201 bool workingset;
202 202
203 entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
204 workingset = entry & 1; 203 workingset = entry & 1;
205 entry >>= 1; 204 entry >>= 1;
206 nid = entry & ((1UL << NODES_SHIFT) - 1); 205 nid = entry & ((1UL << NODES_SHIFT) - 1);
@@ -367,7 +366,7 @@ out:
367 366
368static struct list_lru shadow_nodes; 367static struct list_lru shadow_nodes;
369 368
370void workingset_update_node(struct radix_tree_node *node) 369void workingset_update_node(struct xa_node *node)
371{ 370{
372 /* 371 /*
373 * Track non-empty nodes that contain only shadow entries; 372 * Track non-empty nodes that contain only shadow entries;
@@ -379,7 +378,7 @@ void workingset_update_node(struct radix_tree_node *node)
379 */ 378 */
380 VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ 379 VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
381 380
382 if (node->count && node->count == node->exceptional) { 381 if (node->count && node->count == node->nr_values) {
383 if (list_empty(&node->private_list)) { 382 if (list_empty(&node->private_list)) {
384 list_lru_add(&shadow_nodes, &node->private_list); 383 list_lru_add(&shadow_nodes, &node->private_list);
385 __inc_lruvec_page_state(virt_to_page(node), 384 __inc_lruvec_page_state(virt_to_page(node),
@@ -404,7 +403,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
404 nodes = list_lru_shrink_count(&shadow_nodes, sc); 403 nodes = list_lru_shrink_count(&shadow_nodes, sc);
405 404
406 /* 405 /*
407 * Approximate a reasonable limit for the radix tree nodes 406 * Approximate a reasonable limit for the nodes
408 * containing shadow entries. We don't need to keep more 407 * containing shadow entries. We don't need to keep more
409 * shadow entries than possible pages on the active list, 408 * shadow entries than possible pages on the active list,
410 * since refault distances bigger than that are dismissed. 409 * since refault distances bigger than that are dismissed.
@@ -419,11 +418,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
419 * worst-case density of 1/8th. Below that, not all eligible 418 * worst-case density of 1/8th. Below that, not all eligible
420 * refaults can be detected anymore. 419 * refaults can be detected anymore.
421 * 420 *
422 * On 64-bit with 7 radix_tree_nodes per page and 64 slots 421 * On 64-bit with 7 xa_nodes per page and 64 slots
423 * each, this will reclaim shadow entries when they consume 422 * each, this will reclaim shadow entries when they consume
424 * ~1.8% of available memory: 423 * ~1.8% of available memory:
425 * 424 *
426 * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE 425 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
427 */ 426 */
428#ifdef CONFIG_MEMCG 427#ifdef CONFIG_MEMCG
429 if (sc->memcg) { 428 if (sc->memcg) {
@@ -438,7 +437,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
438#endif 437#endif
439 pages = node_present_pages(sc->nid); 438 pages = node_present_pages(sc->nid);
440 439
441 max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3); 440 max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
442 441
443 if (!nodes) 442 if (!nodes)
444 return SHRINK_EMPTY; 443 return SHRINK_EMPTY;
@@ -451,11 +450,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
451static enum lru_status shadow_lru_isolate(struct list_head *item, 450static enum lru_status shadow_lru_isolate(struct list_head *item,
452 struct list_lru_one *lru, 451 struct list_lru_one *lru,
453 spinlock_t *lru_lock, 452 spinlock_t *lru_lock,
454 void *arg) 453 void *arg) __must_hold(lru_lock)
455{ 454{
455 struct xa_node *node = container_of(item, struct xa_node, private_list);
456 XA_STATE(xas, node->array, 0);
456 struct address_space *mapping; 457 struct address_space *mapping;
457 struct radix_tree_node *node;
458 unsigned int i;
459 int ret; 458 int ret;
460 459
461 /* 460 /*
@@ -463,15 +462,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
463 * the shadow node LRU under the i_pages lock and the 462 * the shadow node LRU under the i_pages lock and the
464 * lru_lock. Because the page cache tree is emptied before 463 * lru_lock. Because the page cache tree is emptied before
465 * the inode can be destroyed, holding the lru_lock pins any 464 * the inode can be destroyed, holding the lru_lock pins any
466 * address_space that has radix tree nodes on the LRU. 465 * address_space that has nodes on the LRU.
467 * 466 *
468 * We can then safely transition to the i_pages lock to 467 * We can then safely transition to the i_pages lock to
469 * pin only the address_space of the particular node we want 468 * pin only the address_space of the particular node we want
470 * to reclaim, take the node off-LRU, and drop the lru_lock. 469 * to reclaim, take the node off-LRU, and drop the lru_lock.
471 */ 470 */
472 471
473 node = container_of(item, struct radix_tree_node, private_list); 472 mapping = container_of(node->array, struct address_space, i_pages);
474 mapping = container_of(node->root, struct address_space, i_pages);
475 473
476 /* Coming from the list, invert the lock order */ 474 /* Coming from the list, invert the lock order */
477 if (!xa_trylock(&mapping->i_pages)) { 475 if (!xa_trylock(&mapping->i_pages)) {
@@ -490,29 +488,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
490 * no pages, so we expect to be able to remove them all and 488 * no pages, so we expect to be able to remove them all and
491 * delete and free the empty node afterwards. 489 * delete and free the empty node afterwards.
492 */ 490 */
493 if (WARN_ON_ONCE(!node->exceptional)) 491 if (WARN_ON_ONCE(!node->nr_values))
494 goto out_invalid; 492 goto out_invalid;
495 if (WARN_ON_ONCE(node->count != node->exceptional)) 493 if (WARN_ON_ONCE(node->count != node->nr_values))
496 goto out_invalid;
497 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
498 if (node->slots[i]) {
499 if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
500 goto out_invalid;
501 if (WARN_ON_ONCE(!node->exceptional))
502 goto out_invalid;
503 if (WARN_ON_ONCE(!mapping->nrexceptional))
504 goto out_invalid;
505 node->slots[i] = NULL;
506 node->exceptional--;
507 node->count--;
508 mapping->nrexceptional--;
509 }
510 }
511 if (WARN_ON_ONCE(node->exceptional))
512 goto out_invalid; 494 goto out_invalid;
495 mapping->nrexceptional -= node->nr_values;
496 xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
497 xas.xa_offset = node->offset;
498 xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
499 xas_set_update(&xas, workingset_update_node);
500 /*
501 * We could store a shadow entry here which was the minimum of the
502 * shadow entries we were tracking ...
503 */
504 xas_store(&xas, NULL);
513 __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); 505 __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
514 __radix_tree_delete_node(&mapping->i_pages, node,
515 workingset_lookup_update(mapping));
516 506
517out_invalid: 507out_invalid:
518 xa_unlock_irq(&mapping->i_pages); 508 xa_unlock_irq(&mapping->i_pages);
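The conversion above replaces the hand-rolled walk over node->slots[] with a single xas_store(NULL) aimed at the node's slot in its parent, and keeps the xa_trylock() dance because the caller arrives holding the LRU lock, i.e. in the inverted lock order. A minimal sketch of that trylock pattern, using only the public xa_*() helpers with hypothetical names (not part of the patch):

/* Hypothetical sketch: opportunistically taking the XArray lock when the
 * normal order (xa_lock first, then a list lock) is inverted, as in
 * shadow_lru_isolate().  Everything except the xa_*() helpers is made up.
 */
#include <linux/xarray.h>

static bool try_reclaim_entry(struct xarray *xa, unsigned long index)
{
	void *entry;

	/* Another lock is already held, so spinning on xa_lock could deadlock. */
	if (!xa_trylock(xa))
		return false;			/* caller retries later */

	entry = xa_load(xa, index);
	if (entry)
		__xa_erase(xa, index);		/* the lock is already held */

	xa_unlock(xa);
	return true;
}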
diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h
index 9bce3b56b5e7..5d2ab38965cc 100644
--- a/tools/include/asm-generic/bitops.h
+++ b/tools/include/asm-generic/bitops.h
@@ -27,5 +27,6 @@
27#include <asm-generic/bitops/hweight.h> 27#include <asm-generic/bitops/hweight.h>
28 28
29#include <asm-generic/bitops/atomic.h> 29#include <asm-generic/bitops/atomic.h>
30#include <asm-generic/bitops/non-atomic.h>
30 31
31#endif /* __TOOLS_ASM_GENERIC_BITOPS_H */ 32#endif /* __TOOLS_ASM_GENERIC_BITOPS_H */
diff --git a/tools/include/asm-generic/bitops/atomic.h b/tools/include/asm-generic/bitops/atomic.h
index 21c41ccd1266..2f6ea28764a7 100644
--- a/tools/include/asm-generic/bitops/atomic.h
+++ b/tools/include/asm-generic/bitops/atomic.h
@@ -15,13 +15,4 @@ static inline void clear_bit(int nr, unsigned long *addr)
15 addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG)); 15 addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG));
16} 16}
17 17
18static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
19{
20 return ((1UL << (nr % __BITS_PER_LONG)) &
21 (((unsigned long *)addr)[nr / __BITS_PER_LONG])) != 0;
22}
23
24#define __set_bit(nr, addr) set_bit(nr, addr)
25#define __clear_bit(nr, addr) clear_bit(nr, addr)
26
27#endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */ 18#endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */
diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h
new file mode 100644
index 000000000000..7e10c4b50c5d
--- /dev/null
+++ b/tools/include/asm-generic/bitops/non-atomic.h
@@ -0,0 +1,109 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
3#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
4
5#include <asm/types.h>
6
7/**
8 * __set_bit - Set a bit in memory
9 * @nr: the bit to set
10 * @addr: the address to start counting from
11 *
12 * Unlike set_bit(), this function is non-atomic and may be reordered.
13 * If it's called on the same region of memory simultaneously, the effect
14 * may be that only one operation succeeds.
15 */
16static inline void __set_bit(int nr, volatile unsigned long *addr)
17{
18 unsigned long mask = BIT_MASK(nr);
19 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
20
21 *p |= mask;
22}
23
24static inline void __clear_bit(int nr, volatile unsigned long *addr)
25{
26 unsigned long mask = BIT_MASK(nr);
27 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
28
29 *p &= ~mask;
30}
31
32/**
33 * __change_bit - Toggle a bit in memory
34 * @nr: the bit to change
35 * @addr: the address to start counting from
36 *
37 * Unlike change_bit(), this function is non-atomic and may be reordered.
38 * If it's called on the same region of memory simultaneously, the effect
39 * may be that only one operation succeeds.
40 */
41static inline void __change_bit(int nr, volatile unsigned long *addr)
42{
43 unsigned long mask = BIT_MASK(nr);
44 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
45
46 *p ^= mask;
47}
48
49/**
50 * __test_and_set_bit - Set a bit and return its old value
51 * @nr: Bit to set
52 * @addr: Address to count from
53 *
54 * This operation is non-atomic and can be reordered.
55 * If two examples of this operation race, one can appear to succeed
56 * but actually fail. You must protect multiple accesses with a lock.
57 */
58static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
59{
60 unsigned long mask = BIT_MASK(nr);
61 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
62 unsigned long old = *p;
63
64 *p = old | mask;
65 return (old & mask) != 0;
66}
67
68/**
69 * __test_and_clear_bit - Clear a bit and return its old value
70 * @nr: Bit to clear
71 * @addr: Address to count from
72 *
73 * This operation is non-atomic and can be reordered.
74 * If two examples of this operation race, one can appear to succeed
75 * but actually fail. You must protect multiple accesses with a lock.
76 */
77static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
78{
79 unsigned long mask = BIT_MASK(nr);
80 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
81 unsigned long old = *p;
82
83 *p = old & ~mask;
84 return (old & mask) != 0;
85}
86
87/* WARNING: non atomic and it can be reordered! */
88static inline int __test_and_change_bit(int nr,
89 volatile unsigned long *addr)
90{
91 unsigned long mask = BIT_MASK(nr);
92 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
93 unsigned long old = *p;
94
95 *p = old ^ mask;
96 return (old & mask) != 0;
97}
98
99/**
100 * test_bit - Determine whether a bit is set
101 * @nr: bit number to test
102 * @addr: Address to start counting from
103 */
104static inline int test_bit(int nr, const volatile unsigned long *addr)
105{
106 return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
107}
108
109#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
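These helpers are the non-atomic counterparts of the tools' set_bit()/clear_bit(); a short usage sketch, assuming the header above is included and that the caller serialises access to the bitmap (illustrative only):

/* Illustrative only: counts and clears every set bit in 'map'.  Safe only
 * because the (hypothetical) caller guarantees nothing else touches 'map'.
 */
static unsigned int count_and_clear(unsigned long *map, unsigned int nbits)
{
	unsigned int i, hits = 0;

	for (i = 0; i < nbits; i++)
		if (__test_and_clear_bit(i, map))
			hits++;

	return hits;
}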
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index e63662db131b..05dca5c203f3 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -15,6 +15,7 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
15 const unsigned long *bitmap2, int bits); 15 const unsigned long *bitmap2, int bits);
16int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, 16int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
17 const unsigned long *bitmap2, unsigned int bits); 17 const unsigned long *bitmap2, unsigned int bits);
18void bitmap_clear(unsigned long *map, unsigned int start, int len);
18 19
19#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) 20#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
20 21
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 0ad884452c5c..6935ef94e77a 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -70,6 +70,7 @@
70#define BUG_ON(cond) assert(!(cond)) 70#define BUG_ON(cond) assert(!(cond))
71#endif 71#endif
72#endif 72#endif
73#define BUG() BUG_ON(1)
73 74
74#if __BYTE_ORDER == __BIG_ENDIAN 75#if __BYTE_ORDER == __BIG_ENDIAN
75#define cpu_to_le16 bswap_16 76#define cpu_to_le16 bswap_16
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index 1738c0391da4..c934572d935c 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -8,8 +8,14 @@
8#define spinlock_t pthread_mutex_t 8#define spinlock_t pthread_mutex_t
9#define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER 9#define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER
10#define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER 10#define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER
11#define spin_lock_init(x) pthread_mutex_init(x, NULL) 11#define spin_lock_init(x) pthread_mutex_init(x, NULL)
12 12
13#define spin_lock(x) pthread_mutex_lock(x)
14#define spin_unlock(x) pthread_mutex_unlock(x)
15#define spin_lock_bh(x) pthread_mutex_lock(x)
16#define spin_unlock_bh(x) pthread_mutex_unlock(x)
17#define spin_lock_irq(x) pthread_mutex_lock(x)
18#define spin_unlock_irq(x) pthread_mutex_unlock(x)
13#define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) 19#define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x)
14#define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) 20#define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x)
15 21
@@ -31,4 +37,6 @@ static inline bool arch_spin_is_locked(arch_spinlock_t *mutex)
31 return true; 37 return true;
32} 38}
33 39
40#include <linux/lockdep.h>
41
34#endif 42#endif
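With the extra spin_lock()/spin_unlock() wrappers added above, kernel-style locking code compiles unchanged in the userspace harness; each spinlock is simply a pthread mutex. A tiny illustrative sketch (names are hypothetical):

#include <linux/spinlock.h>		/* the shim above, via -I../../include */

static DEFINE_SPINLOCK(demo_lock);	/* expands to a pthread mutex */
static unsigned long demo_counter;

static void demo_locked_increment(void)
{
	spin_lock(&demo_lock);		/* pthread_mutex_lock() underneath */
	demo_counter++;
	spin_unlock(&demo_lock);	/* pthread_mutex_unlock() underneath */
}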
diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore
index d4706c0ffceb..3834899b6693 100644
--- a/tools/testing/radix-tree/.gitignore
+++ b/tools/testing/radix-tree/.gitignore
@@ -4,3 +4,4 @@ idr-test
4main 4main
5multiorder 5multiorder
6radix-tree.c 6radix-tree.c
7xarray
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 37baecc3766f..acf1afa01c5b 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -4,8 +4,8 @@ CFLAGS += -I. -I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \
4 -fsanitize=undefined 4 -fsanitize=undefined
5LDFLAGS += -fsanitize=address -fsanitize=undefined 5LDFLAGS += -fsanitize=address -fsanitize=undefined
6LDLIBS+= -lpthread -lurcu 6LDLIBS+= -lpthread -lurcu
7TARGETS = main idr-test multiorder 7TARGETS = main idr-test multiorder xarray
8CORE_OFILES := radix-tree.o idr.o linux.o test.o find_bit.o 8CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o
9OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ 9OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
10 tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o 10 tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o
11 11
@@ -25,6 +25,8 @@ main: $(OFILES)
25idr-test.o: ../../../lib/test_ida.c 25idr-test.o: ../../../lib/test_ida.c
26idr-test: idr-test.o $(CORE_OFILES) 26idr-test: idr-test.o $(CORE_OFILES)
27 27
28xarray: $(CORE_OFILES)
29
28multiorder: multiorder.o $(CORE_OFILES) 30multiorder: multiorder.o $(CORE_OFILES)
29 31
30clean: 32clean:
@@ -35,6 +37,7 @@ vpath %.c ../../lib
35$(OFILES): Makefile *.h */*.h generated/map-shift.h \ 37$(OFILES): Makefile *.h */*.h generated/map-shift.h \
36 ../../include/linux/*.h \ 38 ../../include/linux/*.h \
37 ../../include/asm/*.h \ 39 ../../include/asm/*.h \
40 ../../../include/linux/xarray.h \
38 ../../../include/linux/radix-tree.h \ 41 ../../../include/linux/radix-tree.h \
39 ../../../include/linux/idr.h 42 ../../../include/linux/idr.h
40 43
@@ -44,8 +47,10 @@ radix-tree.c: ../../../lib/radix-tree.c
44idr.c: ../../../lib/idr.c 47idr.c: ../../../lib/idr.c
45 sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ 48 sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
46 49
50xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c
51
47generated/map-shift.h: 52generated/map-shift.h:
48 @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ 53 @if ! grep -qws $(SHIFT) generated/map-shift.h; then \
49 echo "#define RADIX_TREE_MAP_SHIFT $(SHIFT)" > \ 54 echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \
50 generated/map-shift.h; \ 55 generated/map-shift.h; \
51 fi 56 fi
diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c
index 99c40f3ed133..7e195ed8e92d 100644
--- a/tools/testing/radix-tree/benchmark.c
+++ b/tools/testing/radix-tree/benchmark.c
@@ -17,9 +17,6 @@
17#include <time.h> 17#include <time.h>
18#include "test.h" 18#include "test.h"
19 19
20#define for_each_index(i, base, order) \
21 for (i = base; i < base + (1 << order); i++)
22
23#define NSEC_PER_SEC 1000000000L 20#define NSEC_PER_SEC 1000000000L
24 21
25static long long benchmark_iter(struct radix_tree_root *root, bool tagged) 22static long long benchmark_iter(struct radix_tree_root *root, bool tagged)
@@ -61,7 +58,7 @@ again:
61} 58}
62 59
63static void benchmark_insert(struct radix_tree_root *root, 60static void benchmark_insert(struct radix_tree_root *root,
64 unsigned long size, unsigned long step, int order) 61 unsigned long size, unsigned long step)
65{ 62{
66 struct timespec start, finish; 63 struct timespec start, finish;
67 unsigned long index; 64 unsigned long index;
@@ -70,19 +67,19 @@ static void benchmark_insert(struct radix_tree_root *root,
70 clock_gettime(CLOCK_MONOTONIC, &start); 67 clock_gettime(CLOCK_MONOTONIC, &start);
71 68
72 for (index = 0 ; index < size ; index += step) 69 for (index = 0 ; index < size ; index += step)
73 item_insert_order(root, index, order); 70 item_insert(root, index);
74 71
75 clock_gettime(CLOCK_MONOTONIC, &finish); 72 clock_gettime(CLOCK_MONOTONIC, &finish);
76 73
77 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + 74 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
78 (finish.tv_nsec - start.tv_nsec); 75 (finish.tv_nsec - start.tv_nsec);
79 76
80 printv(2, "Size: %8ld, step: %8ld, order: %d, insertion: %15lld ns\n", 77 printv(2, "Size: %8ld, step: %8ld, insertion: %15lld ns\n",
81 size, step, order, nsec); 78 size, step, nsec);
82} 79}
83 80
84static void benchmark_tagging(struct radix_tree_root *root, 81static void benchmark_tagging(struct radix_tree_root *root,
85 unsigned long size, unsigned long step, int order) 82 unsigned long size, unsigned long step)
86{ 83{
87 struct timespec start, finish; 84 struct timespec start, finish;
88 unsigned long index; 85 unsigned long index;
@@ -98,138 +95,53 @@ static void benchmark_tagging(struct radix_tree_root *root,
98 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + 95 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
99 (finish.tv_nsec - start.tv_nsec); 96 (finish.tv_nsec - start.tv_nsec);
100 97
101 printv(2, "Size: %8ld, step: %8ld, order: %d, tagging: %17lld ns\n", 98 printv(2, "Size: %8ld, step: %8ld, tagging: %17lld ns\n",
102 size, step, order, nsec); 99 size, step, nsec);
103} 100}
104 101
105static void benchmark_delete(struct radix_tree_root *root, 102static void benchmark_delete(struct radix_tree_root *root,
106 unsigned long size, unsigned long step, int order) 103 unsigned long size, unsigned long step)
107{ 104{
108 struct timespec start, finish; 105 struct timespec start, finish;
109 unsigned long index, i; 106 unsigned long index;
110 long long nsec; 107 long long nsec;
111 108
112 clock_gettime(CLOCK_MONOTONIC, &start); 109 clock_gettime(CLOCK_MONOTONIC, &start);
113 110
114 for (index = 0 ; index < size ; index += step) 111 for (index = 0 ; index < size ; index += step)
115 for_each_index(i, index, order) 112 item_delete(root, index);
116 item_delete(root, i);
117 113
118 clock_gettime(CLOCK_MONOTONIC, &finish); 114 clock_gettime(CLOCK_MONOTONIC, &finish);
119 115
120 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + 116 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
121 (finish.tv_nsec - start.tv_nsec); 117 (finish.tv_nsec - start.tv_nsec);
122 118
123 printv(2, "Size: %8ld, step: %8ld, order: %d, deletion: %16lld ns\n", 119 printv(2, "Size: %8ld, step: %8ld, deletion: %16lld ns\n",
124 size, step, order, nsec); 120 size, step, nsec);
125} 121}
126 122
127static void benchmark_size(unsigned long size, unsigned long step, int order) 123static void benchmark_size(unsigned long size, unsigned long step)
128{ 124{
129 RADIX_TREE(tree, GFP_KERNEL); 125 RADIX_TREE(tree, GFP_KERNEL);
130 long long normal, tagged; 126 long long normal, tagged;
131 127
132 benchmark_insert(&tree, size, step, order); 128 benchmark_insert(&tree, size, step);
133 benchmark_tagging(&tree, size, step, order); 129 benchmark_tagging(&tree, size, step);
134 130
135 tagged = benchmark_iter(&tree, true); 131 tagged = benchmark_iter(&tree, true);
136 normal = benchmark_iter(&tree, false); 132 normal = benchmark_iter(&tree, false);
137 133
138 printv(2, "Size: %8ld, step: %8ld, order: %d, tagged iteration: %8lld ns\n", 134 printv(2, "Size: %8ld, step: %8ld, tagged iteration: %8lld ns\n",
139 size, step, order, tagged); 135 size, step, tagged);
140 printv(2, "Size: %8ld, step: %8ld, order: %d, normal iteration: %8lld ns\n", 136 printv(2, "Size: %8ld, step: %8ld, normal iteration: %8lld ns\n",
141 size, step, order, normal); 137 size, step, normal);
142 138
143 benchmark_delete(&tree, size, step, order); 139 benchmark_delete(&tree, size, step);
144 140
145 item_kill_tree(&tree); 141 item_kill_tree(&tree);
146 rcu_barrier(); 142 rcu_barrier();
147} 143}
148 144
149static long long __benchmark_split(unsigned long index,
150 int old_order, int new_order)
151{
152 struct timespec start, finish;
153 long long nsec;
154 RADIX_TREE(tree, GFP_ATOMIC);
155
156 item_insert_order(&tree, index, old_order);
157
158 clock_gettime(CLOCK_MONOTONIC, &start);
159 radix_tree_split(&tree, index, new_order);
160 clock_gettime(CLOCK_MONOTONIC, &finish);
161 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
162 (finish.tv_nsec - start.tv_nsec);
163
164 item_kill_tree(&tree);
165
166 return nsec;
167
168}
169
170static void benchmark_split(unsigned long size, unsigned long step)
171{
172 int i, j, idx;
173 long long nsec = 0;
174
175
176 for (idx = 0; idx < size; idx += step) {
177 for (i = 3; i < 11; i++) {
178 for (j = 0; j < i; j++) {
179 nsec += __benchmark_split(idx, i, j);
180 }
181 }
182 }
183
184 printv(2, "Size %8ld, step %8ld, split time %10lld ns\n",
185 size, step, nsec);
186
187}
188
189static long long __benchmark_join(unsigned long index,
190 unsigned order1, unsigned order2)
191{
192 unsigned long loc;
193 struct timespec start, finish;
194 long long nsec;
195 void *item, *item2 = item_create(index + 1, order1);
196 RADIX_TREE(tree, GFP_KERNEL);
197
198 item_insert_order(&tree, index, order2);
199 item = radix_tree_lookup(&tree, index);
200
201 clock_gettime(CLOCK_MONOTONIC, &start);
202 radix_tree_join(&tree, index + 1, order1, item2);
203 clock_gettime(CLOCK_MONOTONIC, &finish);
204 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
205 (finish.tv_nsec - start.tv_nsec);
206
207 loc = find_item(&tree, item);
208 if (loc == -1)
209 free(item);
210
211 item_kill_tree(&tree);
212
213 return nsec;
214}
215
216static void benchmark_join(unsigned long step)
217{
218 int i, j, idx;
219 long long nsec = 0;
220
221 for (idx = 0; idx < 1 << 10; idx += step) {
222 for (i = 1; i < 15; i++) {
223 for (j = 0; j < i; j++) {
224 nsec += __benchmark_join(idx, i, j);
225 }
226 }
227 }
228
229 printv(2, "Size %8d, step %8ld, join time %10lld ns\n",
230 1 << 10, step, nsec);
231}
232
233void benchmark(void) 145void benchmark(void)
234{ 146{
235 unsigned long size[] = {1 << 10, 1 << 20, 0}; 147 unsigned long size[] = {1 << 10, 1 << 20, 0};
@@ -242,16 +154,5 @@ void benchmark(void)
242 154
243 for (c = 0; size[c]; c++) 155 for (c = 0; size[c]; c++)
244 for (s = 0; step[s]; s++) 156 for (s = 0; step[s]; s++)
245 benchmark_size(size[c], step[s], 0); 157 benchmark_size(size[c], step[s]);
246
247 for (c = 0; size[c]; c++)
248 for (s = 0; step[s]; s++)
249 benchmark_size(size[c], step[s] << 9, 9);
250
251 for (c = 0; size[c]; c++)
252 for (s = 0; step[s]; s++)
253 benchmark_split(size[c], step[s]);
254
255 for (s = 0; step[s]; s++)
256 benchmark_join(step[s]);
257} 158}
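Each benchmark_*() function above repeats the same timespec arithmetic; a small helper equivalent to those inline calculations (illustrative, not part of the patch):

#include <time.h>

/* Illustrative only: the elapsed-time calculation each benchmark repeats,
 * expressed as a helper.  The constant matches NSEC_PER_SEC above.
 */
static long long elapsed_ns(const struct timespec *start,
			    const struct timespec *finish)
{
	return (finish->tv_sec - start->tv_sec) * 1000000000LL +
	       (finish->tv_nsec - start->tv_nsec);
}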
diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c
new file mode 100644
index 000000000000..66ec4a24a203
--- /dev/null
+++ b/tools/testing/radix-tree/bitmap.c
@@ -0,0 +1,23 @@
1/* lib/bitmap.c pulls in at least two other files. */
2
3#include <linux/bitmap.h>
4
5void bitmap_clear(unsigned long *map, unsigned int start, int len)
6{
7 unsigned long *p = map + BIT_WORD(start);
8 const unsigned int size = start + len;
9 int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
10 unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
11
12 while (len - bits_to_clear >= 0) {
13 *p &= ~mask_to_clear;
14 len -= bits_to_clear;
15 bits_to_clear = BITS_PER_LONG;
16 mask_to_clear = ~0UL;
17 p++;
18 }
19 if (len) {
20 mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
21 *p &= ~mask_to_clear;
22 }
23}
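The stub above clears a run of bits that may straddle word boundaries; a short, hypothetical usage sketch on a 64-bit build:

#include <linux/bitmap.h>

/* Illustrative only: on a 64-bit build this clears bits 60..67, i.e. the
 * top four bits of map[0] and the bottom four bits of map[1].
 */
static void bitmap_clear_example(void)
{
	unsigned long map[2] = { ~0UL, ~0UL };

	bitmap_clear(map, 60, 8);
}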
diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h
index cf88dc5b8832..2218b3cc184e 100644
--- a/tools/testing/radix-tree/generated/autoconf.h
+++ b/tools/testing/radix-tree/generated/autoconf.h
@@ -1 +1 @@
#define CONFIG_RADIX_TREE_MULTIORDER 1 #define CONFIG_XARRAY_MULTI 1
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c
index 321ba92c70d2..1b63bdb7688f 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -19,7 +19,7 @@
19 19
20#include "test.h" 20#include "test.h"
21 21
22#define DUMMY_PTR ((void *)0x12) 22#define DUMMY_PTR ((void *)0x10)
23 23
24int item_idr_free(int id, void *p, void *data) 24int item_idr_free(int id, void *p, void *data)
25{ 25{
@@ -227,6 +227,66 @@ void idr_u32_test(int base)
227 idr_u32_test1(&idr, 0xffffffff); 227 idr_u32_test1(&idr, 0xffffffff);
228} 228}
229 229
230static void idr_align_test(struct idr *idr)
231{
232 char name[] = "Motorola 68000";
233 int i, id;
234 void *entry;
235
236 for (i = 0; i < 9; i++) {
237 BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i);
238 idr_for_each_entry(idr, entry, id);
239 }
240 idr_destroy(idr);
241
242 for (i = 1; i < 10; i++) {
243 BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 1);
244 idr_for_each_entry(idr, entry, id);
245 }
246 idr_destroy(idr);
247
248 for (i = 2; i < 11; i++) {
249 BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 2);
250 idr_for_each_entry(idr, entry, id);
251 }
252 idr_destroy(idr);
253
254 for (i = 3; i < 12; i++) {
255 BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 3);
256 idr_for_each_entry(idr, entry, id);
257 }
258 idr_destroy(idr);
259
260 for (i = 0; i < 8; i++) {
261 BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != 0);
262 BUG_ON(idr_alloc(idr, &name[i + 1], 0, 0, GFP_KERNEL) != 1);
263 idr_for_each_entry(idr, entry, id);
264 idr_remove(idr, 1);
265 idr_for_each_entry(idr, entry, id);
266 idr_remove(idr, 0);
267 BUG_ON(!idr_is_empty(idr));
268 }
269
270 for (i = 0; i < 8; i++) {
271 BUG_ON(idr_alloc(idr, NULL, 0, 0, GFP_KERNEL) != 0);
272 idr_for_each_entry(idr, entry, id);
273 idr_replace(idr, &name[i], 0);
274 idr_for_each_entry(idr, entry, id);
275 BUG_ON(idr_find(idr, 0) != &name[i]);
276 idr_remove(idr, 0);
277 }
278
279 for (i = 0; i < 8; i++) {
280 BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != 0);
281 BUG_ON(idr_alloc(idr, NULL, 0, 0, GFP_KERNEL) != 1);
282 idr_remove(idr, 1);
283 idr_for_each_entry(idr, entry, id);
284 idr_replace(idr, &name[i + 1], 0);
285 idr_for_each_entry(idr, entry, id);
286 idr_remove(idr, 0);
287 }
288}
289
230void idr_checks(void) 290void idr_checks(void)
231{ 291{
232 unsigned long i; 292 unsigned long i;
@@ -307,6 +367,7 @@ void idr_checks(void)
307 idr_u32_test(4); 367 idr_u32_test(4);
308 idr_u32_test(1); 368 idr_u32_test(1);
309 idr_u32_test(0); 369 idr_u32_test(0);
370 idr_align_test(&idr);
310} 371}
311 372
312#define module_init(x) 373#define module_init(x)
@@ -344,16 +405,16 @@ void ida_check_conv_user(void)
344 DEFINE_IDA(ida); 405 DEFINE_IDA(ida);
345 unsigned long i; 406 unsigned long i;
346 407
347 radix_tree_cpu_dead(1);
348 for (i = 0; i < 1000000; i++) { 408 for (i = 0; i < 1000000; i++) {
349 int id = ida_alloc(&ida, GFP_NOWAIT); 409 int id = ida_alloc(&ida, GFP_NOWAIT);
350 if (id == -ENOMEM) { 410 if (id == -ENOMEM) {
351 IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) != 411 IDA_BUG_ON(&ida, ((i % IDA_BITMAP_BITS) !=
352 BITS_PER_LONG - 2); 412 BITS_PER_XA_VALUE) &&
413 ((i % IDA_BITMAP_BITS) != 0));
353 id = ida_alloc(&ida, GFP_KERNEL); 414 id = ida_alloc(&ida, GFP_KERNEL);
354 } else { 415 } else {
355 IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) == 416 IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) ==
356 BITS_PER_LONG - 2); 417 BITS_PER_XA_VALUE);
357 } 418 }
358 IDA_BUG_ON(&ida, id != i); 419 IDA_BUG_ON(&ida, id != i);
359 } 420 }
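The idr_align_test() added earlier in this file drives idr_alloc() with pointers of every alignment; the allocate / look up / free cycle it exercises looks roughly like this sketch (illustrative only, hypothetical names):

#include <linux/idr.h>

static DEFINE_IDR(example_idr);

/* Illustrative only: allocate an ID for 'payload', look it up, free it.
 * Assumes nothing else uses example_idr concurrently.
 */
static void idr_example(void *payload)
{
	int id = idr_alloc(&example_idr, payload, 0, 0, GFP_KERNEL);

	if (id < 0)
		return;				/* allocation failed */

	BUG_ON(idr_find(&example_idr, id) != payload);
	idr_remove(&example_idr, id);
	BUG_ON(!idr_is_empty(&example_idr));
}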
diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c
index a92bab513701..238db187aa15 100644
--- a/tools/testing/radix-tree/iteration_check.c
+++ b/tools/testing/radix-tree/iteration_check.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * iteration_check.c: test races having to do with radix tree iteration 2 * iteration_check.c: test races having to do with xarray iteration
3 * Copyright (c) 2016 Intel Corporation 3 * Copyright (c) 2016 Intel Corporation
4 * Author: Ross Zwisler <ross.zwisler@linux.intel.com> 4 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
5 * 5 *
@@ -12,41 +12,54 @@
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details. 13 * more details.
14 */ 14 */
15#include <linux/radix-tree.h>
16#include <pthread.h> 15#include <pthread.h>
17#include "test.h" 16#include "test.h"
18 17
19#define NUM_THREADS 5 18#define NUM_THREADS 5
20#define MAX_IDX 100 19#define MAX_IDX 100
21#define TAG 0 20#define TAG XA_MARK_0
22#define NEW_TAG 1 21#define NEW_TAG XA_MARK_1
23 22
24static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
25static pthread_t threads[NUM_THREADS]; 23static pthread_t threads[NUM_THREADS];
26static unsigned int seeds[3]; 24static unsigned int seeds[3];
27static RADIX_TREE(tree, GFP_KERNEL); 25static DEFINE_XARRAY(array);
28static bool test_complete; 26static bool test_complete;
29static int max_order; 27static int max_order;
30 28
31/* relentlessly fill the tree with tagged entries */ 29void my_item_insert(struct xarray *xa, unsigned long index)
30{
31 XA_STATE(xas, xa, index);
32 struct item *item = item_create(index, 0);
33 int order;
34
35retry:
36 xas_lock(&xas);
37 for (order = max_order; order >= 0; order--) {
38 xas_set_order(&xas, index, order);
39 item->order = order;
40 if (xas_find_conflict(&xas))
41 continue;
42 xas_store(&xas, item);
43 xas_set_mark(&xas, TAG);
44 break;
45 }
46 xas_unlock(&xas);
47 if (xas_nomem(&xas, GFP_KERNEL))
48 goto retry;
49 if (order < 0)
50 free(item);
51}
52
53/* relentlessly fill the array with tagged entries */
32static void *add_entries_fn(void *arg) 54static void *add_entries_fn(void *arg)
33{ 55{
34 rcu_register_thread(); 56 rcu_register_thread();
35 57
36 while (!test_complete) { 58 while (!test_complete) {
37 unsigned long pgoff; 59 unsigned long pgoff;
38 int order;
39 60
40 for (pgoff = 0; pgoff < MAX_IDX; pgoff++) { 61 for (pgoff = 0; pgoff < MAX_IDX; pgoff++) {
41 pthread_mutex_lock(&tree_lock); 62 my_item_insert(&array, pgoff);
42 for (order = max_order; order >= 0; order--) {
43 if (item_insert_order(&tree, pgoff, order)
44 == 0) {
45 item_tag_set(&tree, pgoff, TAG);
46 break;
47 }
48 }
49 pthread_mutex_unlock(&tree_lock);
50 } 63 }
51 } 64 }
52 65
@@ -56,33 +69,25 @@ static void *add_entries_fn(void *arg)
56} 69}
57 70
58/* 71/*
59 * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find 72 * Iterate over tagged entries, retrying when we find ourselves in a deleted
60 * things that have been removed and randomly resetting our iteration to the 73 * node and randomly pausing the iteration.
61 * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
62 * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
63 * NULL 'slot' variable.
64 */ 74 */
65static void *tagged_iteration_fn(void *arg) 75static void *tagged_iteration_fn(void *arg)
66{ 76{
67 struct radix_tree_iter iter; 77 XA_STATE(xas, &array, 0);
68 void **slot; 78 void *entry;
69 79
70 rcu_register_thread(); 80 rcu_register_thread();
71 81
72 while (!test_complete) { 82 while (!test_complete) {
83 xas_set(&xas, 0);
73 rcu_read_lock(); 84 rcu_read_lock();
74 radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { 85 xas_for_each_marked(&xas, entry, ULONG_MAX, TAG) {
75 void *entry = radix_tree_deref_slot(slot); 86 if (xas_retry(&xas, entry))
76 if (unlikely(!entry))
77 continue; 87 continue;
78 88
79 if (radix_tree_deref_retry(entry)) {
80 slot = radix_tree_iter_retry(&iter);
81 continue;
82 }
83
84 if (rand_r(&seeds[0]) % 50 == 0) { 89 if (rand_r(&seeds[0]) % 50 == 0) {
85 slot = radix_tree_iter_resume(slot, &iter); 90 xas_pause(&xas);
86 rcu_read_unlock(); 91 rcu_read_unlock();
87 rcu_barrier(); 92 rcu_barrier();
88 rcu_read_lock(); 93 rcu_read_lock();
@@ -97,33 +102,25 @@ static void *tagged_iteration_fn(void *arg)
97} 102}
98 103
99/* 104/*
100 * Iterate over the entries, doing a radix_tree_iter_retry() as we find things 105 * Iterate over the entries, retrying when we find ourselves in a deleted
101 * that have been removed and randomly resetting our iteration to the next 106 * node and randomly pausing the iteration.
102 * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
103 * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
104 * NULL 'slot' variable.
105 */ 107 */
106static void *untagged_iteration_fn(void *arg) 108static void *untagged_iteration_fn(void *arg)
107{ 109{
108 struct radix_tree_iter iter; 110 XA_STATE(xas, &array, 0);
109 void **slot; 111 void *entry;
110 112
111 rcu_register_thread(); 113 rcu_register_thread();
112 114
113 while (!test_complete) { 115 while (!test_complete) {
116 xas_set(&xas, 0);
114 rcu_read_lock(); 117 rcu_read_lock();
115 radix_tree_for_each_slot(slot, &tree, &iter, 0) { 118 xas_for_each(&xas, entry, ULONG_MAX) {
116 void *entry = radix_tree_deref_slot(slot); 119 if (xas_retry(&xas, entry))
117 if (unlikely(!entry))
118 continue; 120 continue;
119 121
120 if (radix_tree_deref_retry(entry)) {
121 slot = radix_tree_iter_retry(&iter);
122 continue;
123 }
124
125 if (rand_r(&seeds[1]) % 50 == 0) { 122 if (rand_r(&seeds[1]) % 50 == 0) {
126 slot = radix_tree_iter_resume(slot, &iter); 123 xas_pause(&xas);
127 rcu_read_unlock(); 124 rcu_read_unlock();
128 rcu_barrier(); 125 rcu_barrier();
129 rcu_read_lock(); 126 rcu_read_lock();
@@ -138,7 +135,7 @@ static void *untagged_iteration_fn(void *arg)
138} 135}
139 136
140/* 137/*
141 * Randomly remove entries to help induce radix_tree_iter_retry() calls in the 138 * Randomly remove entries to help induce retries in the
142 * two iteration functions. 139 * two iteration functions.
143 */ 140 */
144static void *remove_entries_fn(void *arg) 141static void *remove_entries_fn(void *arg)
@@ -147,12 +144,13 @@ static void *remove_entries_fn(void *arg)
147 144
148 while (!test_complete) { 145 while (!test_complete) {
149 int pgoff; 146 int pgoff;
147 struct item *item;
150 148
151 pgoff = rand_r(&seeds[2]) % MAX_IDX; 149 pgoff = rand_r(&seeds[2]) % MAX_IDX;
152 150
153 pthread_mutex_lock(&tree_lock); 151 item = xa_erase(&array, pgoff);
154 item_delete(&tree, pgoff); 152 if (item)
155 pthread_mutex_unlock(&tree_lock); 153 item_free(item, pgoff);
156 } 154 }
157 155
158 rcu_unregister_thread(); 156 rcu_unregister_thread();
@@ -165,8 +163,7 @@ static void *tag_entries_fn(void *arg)
165 rcu_register_thread(); 163 rcu_register_thread();
166 164
167 while (!test_complete) { 165 while (!test_complete) {
168 tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG, 166 tag_tagged_items(&array, 0, MAX_IDX, 10, TAG, NEW_TAG);
169 NEW_TAG);
170 } 167 }
171 rcu_unregister_thread(); 168 rcu_unregister_thread();
172 return NULL; 169 return NULL;
@@ -217,5 +214,5 @@ void iteration_test(unsigned order, unsigned test_duration)
217 } 214 }
218 } 215 }
219 216
220 item_kill_tree(&tree); 217 item_kill_tree(&array);
221} 218}
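The iterator threads above all follow the same lockless-walk pattern: filter every entry through xas_retry() and call xas_pause() before dropping the RCU read lock. Condensed into one self-contained sketch (illustrative only):

#include <linux/xarray.h>
#include <linux/rcupdate.h>

/* Illustrative only: walk every entry carrying 'mark', tolerating
 * concurrent removals and periodically leaving the RCU read section.
 */
static void walk_marked(struct xarray *xa, xa_mark_t mark)
{
	XA_STATE(xas, xa, 0);
	void *entry;

	rcu_read_lock();
	xas_for_each_marked(&xas, entry, ULONG_MAX, mark) {
		if (xas_retry(&xas, entry))
			continue;
		/* ... use 'entry' under RCU ... */
		xas_pause(&xas);
		rcu_read_unlock();
		/* sleeping or blocking work would be safe here */
		rcu_read_lock();
	}
	rcu_read_unlock();
}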
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h
index 23b8ed52f8c8..03dc8a57eb99 100644
--- a/tools/testing/radix-tree/linux/bug.h
+++ b/tools/testing/radix-tree/linux/bug.h
@@ -1 +1,2 @@
1#include <stdio.h>
1#include "asm/bug.h" 2#include "asm/bug.h"
diff --git a/tools/testing/radix-tree/linux/kconfig.h b/tools/testing/radix-tree/linux/kconfig.h
new file mode 100644
index 000000000000..6c8675859913
--- /dev/null
+++ b/tools/testing/radix-tree/linux/kconfig.h
@@ -0,0 +1 @@
#include "../../../../include/linux/kconfig.h"
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index 426f32f28547..4568248222ae 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -14,7 +14,12 @@
14#include "../../../include/linux/kconfig.h" 14#include "../../../include/linux/kconfig.h"
15 15
16#define printk printf 16#define printk printf
17#define pr_info printk
17#define pr_debug printk 18#define pr_debug printk
18#define pr_cont printk 19#define pr_cont printk
19 20
21#define __acquires(x)
22#define __releases(x)
23#define __must_hold(x)
24
20#endif /* _KERNEL_H */ 25#endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h
new file mode 100644
index 000000000000..565fccdfe6e9
--- /dev/null
+++ b/tools/testing/radix-tree/linux/lockdep.h
@@ -0,0 +1,11 @@
1#ifndef _LINUX_LOCKDEP_H
2#define _LINUX_LOCKDEP_H
3struct lock_class_key {
4 unsigned int a;
5};
6
7static inline void lockdep_set_class(spinlock_t *lock,
8 struct lock_class_key *key)
9{
10}
11#endif /* _LINUX_LOCKDEP_H */
diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h
index 24f13d27a8da..d1635a5bef02 100644
--- a/tools/testing/radix-tree/linux/radix-tree.h
+++ b/tools/testing/radix-tree/linux/radix-tree.h
@@ -2,7 +2,6 @@
2#ifndef _TEST_RADIX_TREE_H 2#ifndef _TEST_RADIX_TREE_H
3#define _TEST_RADIX_TREE_H 3#define _TEST_RADIX_TREE_H
4 4
5#include "generated/map-shift.h"
6#include "../../../../include/linux/radix-tree.h" 5#include "../../../../include/linux/radix-tree.h"
7 6
8extern int kmalloc_verbose; 7extern int kmalloc_verbose;
diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h
index 73ed33658203..fd280b070fdb 100644
--- a/tools/testing/radix-tree/linux/rcupdate.h
+++ b/tools/testing/radix-tree/linux/rcupdate.h
@@ -6,5 +6,7 @@
6 6
7#define rcu_dereference_raw(p) rcu_dereference(p) 7#define rcu_dereference_raw(p) rcu_dereference(p)
8#define rcu_dereference_protected(p, cond) rcu_dereference(p) 8#define rcu_dereference_protected(p, cond) rcu_dereference(p)
9#define rcu_dereference_check(p, cond) rcu_dereference(p)
10#define RCU_INIT_POINTER(p, v) (p) = (v)
9 11
10#endif 12#endif
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index b741686e53d6..77a44c54998f 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -214,7 +214,7 @@ void copy_tag_check(void)
214 } 214 }
215 215
216// printf("\ncopying tags...\n"); 216// printf("\ncopying tags...\n");
217 tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1); 217 tagged = tag_tagged_items(&tree, start, end, ITEMS, XA_MARK_0, XA_MARK_1);
218 218
219// printf("checking copied tags\n"); 219// printf("checking copied tags\n");
220 assert(tagged == count); 220 assert(tagged == count);
@@ -223,7 +223,7 @@ void copy_tag_check(void)
223 /* Copy tags in several rounds */ 223 /* Copy tags in several rounds */
224// printf("\ncopying tags...\n"); 224// printf("\ncopying tags...\n");
225 tmp = rand() % (count / 10 + 2); 225 tmp = rand() % (count / 10 + 2);
226 tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2); 226 tagged = tag_tagged_items(&tree, start, end, tmp, XA_MARK_0, XA_MARK_2);
227 assert(tagged == count); 227 assert(tagged == count);
228 228
229// printf("%lu %lu %lu\n", tagged, tmp, count); 229// printf("%lu %lu %lu\n", tagged, tmp, count);
@@ -236,63 +236,6 @@ void copy_tag_check(void)
236 item_kill_tree(&tree); 236 item_kill_tree(&tree);
237} 237}
238 238
239static void __locate_check(struct radix_tree_root *tree, unsigned long index,
240 unsigned order)
241{
242 struct item *item;
243 unsigned long index2;
244
245 item_insert_order(tree, index, order);
246 item = item_lookup(tree, index);
247 index2 = find_item(tree, item);
248 if (index != index2) {
249 printv(2, "index %ld order %d inserted; found %ld\n",
250 index, order, index2);
251 abort();
252 }
253}
254
255static void __order_0_locate_check(void)
256{
257 RADIX_TREE(tree, GFP_KERNEL);
258 int i;
259
260 for (i = 0; i < 50; i++)
261 __locate_check(&tree, rand() % INT_MAX, 0);
262
263 item_kill_tree(&tree);
264}
265
266static void locate_check(void)
267{
268 RADIX_TREE(tree, GFP_KERNEL);
269 unsigned order;
270 unsigned long offset, index;
271
272 __order_0_locate_check();
273
274 for (order = 0; order < 20; order++) {
275 for (offset = 0; offset < (1 << (order + 3));
276 offset += (1UL << order)) {
277 for (index = 0; index < (1UL << (order + 5));
278 index += (1UL << order)) {
279 __locate_check(&tree, index + offset, order);
280 }
281 if (find_item(&tree, &tree) != -1)
282 abort();
283
284 item_kill_tree(&tree);
285 }
286 }
287
288 if (find_item(&tree, &tree) != -1)
289 abort();
290 __locate_check(&tree, -1, 0);
291 if (find_item(&tree, &tree) != -1)
292 abort();
293 item_kill_tree(&tree);
294}
295
296static void single_thread_tests(bool long_run) 239static void single_thread_tests(bool long_run)
297{ 240{
298 int i; 241 int i;
@@ -303,10 +246,6 @@ static void single_thread_tests(bool long_run)
303 rcu_barrier(); 246 rcu_barrier();
304 printv(2, "after multiorder_check: %d allocated, preempt %d\n", 247 printv(2, "after multiorder_check: %d allocated, preempt %d\n",
305 nr_allocated, preempt_count); 248 nr_allocated, preempt_count);
306 locate_check();
307 rcu_barrier();
308 printv(2, "after locate_check: %d allocated, preempt %d\n",
309 nr_allocated, preempt_count);
310 tag_check(); 249 tag_check();
311 rcu_barrier(); 250 rcu_barrier();
312 printv(2, "after tag_check: %d allocated, preempt %d\n", 251 printv(2, "after tag_check: %d allocated, preempt %d\n",
@@ -365,6 +304,7 @@ int main(int argc, char **argv)
365 rcu_register_thread(); 304 rcu_register_thread();
366 radix_tree_init(); 305 radix_tree_init();
367 306
307 xarray_tests();
368 regression1_test(); 308 regression1_test();
369 regression2_test(); 309 regression2_test();
370 regression3_test(); 310 regression3_test();
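The tag_tagged_items() calls above now take xa_mark_t values instead of bare integers and no longer need an external lock. A minimal sketch of the mark API those XA_MARK_* arguments refer to (illustrative only):

#include <linux/xarray.h>

/* Illustrative only: each XArray entry carries up to three marks
 * (XA_MARK_0..XA_MARK_2), which replace radix-tree tags.
 */
static void mark_example(struct xarray *xa, unsigned long index)
{
	xa_set_mark(xa, index, XA_MARK_0);

	if (xa_get_mark(xa, index, XA_MARK_0))
		xa_set_mark(xa, index, XA_MARK_1);	/* "copy" the mark */

	xa_clear_mark(xa, index, XA_MARK_0);
}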
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 7bf405638b0b..ff27a74d9762 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -20,230 +20,39 @@
20 20
21#include "test.h" 21#include "test.h"
22 22
23#define for_each_index(i, base, order) \ 23static int item_insert_order(struct xarray *xa, unsigned long index,
24 for (i = base; i < base + (1 << order); i++) 24 unsigned order)
25
26static void __multiorder_tag_test(int index, int order)
27{
28 RADIX_TREE(tree, GFP_KERNEL);
29 int base, err, i;
30
31 /* our canonical entry */
32 base = index & ~((1 << order) - 1);
33
34 printv(2, "Multiorder tag test with index %d, canonical entry %d\n",
35 index, base);
36
37 err = item_insert_order(&tree, index, order);
38 assert(!err);
39
40 /*
41 * Verify we get collisions for covered indices. We try and fail to
42 * insert an exceptional entry so we don't leak memory via
43 * item_insert_order().
44 */
45 for_each_index(i, base, order) {
46 err = __radix_tree_insert(&tree, i, order,
47 (void *)(0xA0 | RADIX_TREE_EXCEPTIONAL_ENTRY));
48 assert(err == -EEXIST);
49 }
50
51 for_each_index(i, base, order) {
52 assert(!radix_tree_tag_get(&tree, i, 0));
53 assert(!radix_tree_tag_get(&tree, i, 1));
54 }
55
56 assert(radix_tree_tag_set(&tree, index, 0));
57
58 for_each_index(i, base, order) {
59 assert(radix_tree_tag_get(&tree, i, 0));
60 assert(!radix_tree_tag_get(&tree, i, 1));
61 }
62
63 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1);
64 assert(radix_tree_tag_clear(&tree, index, 0));
65
66 for_each_index(i, base, order) {
67 assert(!radix_tree_tag_get(&tree, i, 0));
68 assert(radix_tree_tag_get(&tree, i, 1));
69 }
70
71 assert(radix_tree_tag_clear(&tree, index, 1));
72
73 assert(!radix_tree_tagged(&tree, 0));
74 assert(!radix_tree_tagged(&tree, 1));
75
76 item_kill_tree(&tree);
77}
78
79static void __multiorder_tag_test2(unsigned order, unsigned long index2)
80{ 25{
81 RADIX_TREE(tree, GFP_KERNEL); 26 XA_STATE_ORDER(xas, xa, index, order);
82 unsigned long index = (1 << order); 27 struct item *item = item_create(index, order);
83 index2 += index;
84
85 assert(item_insert_order(&tree, 0, order) == 0);
86 assert(item_insert(&tree, index2) == 0);
87
88 assert(radix_tree_tag_set(&tree, 0, 0));
89 assert(radix_tree_tag_set(&tree, index2, 0));
90
91 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2);
92
93 item_kill_tree(&tree);
94}
95
96static void multiorder_tag_tests(void)
97{
98 int i, j;
99
100 /* test multi-order entry for indices 0-7 with no sibling pointers */
101 __multiorder_tag_test(0, 3);
102 __multiorder_tag_test(5, 3);
103
104 /* test multi-order entry for indices 8-15 with no sibling pointers */
105 __multiorder_tag_test(8, 3);
106 __multiorder_tag_test(15, 3);
107
108 /*
109 * Our order 5 entry covers indices 0-31 in a tree with height=2.
110 * This is broken up as follows:
111 * 0-7: canonical entry
112 * 8-15: sibling 1
113 * 16-23: sibling 2
114 * 24-31: sibling 3
115 */
116 __multiorder_tag_test(0, 5);
117 __multiorder_tag_test(29, 5);
118
119 /* same test, but with indices 32-63 */
120 __multiorder_tag_test(32, 5);
121 __multiorder_tag_test(44, 5);
122
123 /*
124 * Our order 8 entry covers indices 0-255 in a tree with height=3.
125 * This is broken up as follows:
126 * 0-63: canonical entry
127 * 64-127: sibling 1
128 * 128-191: sibling 2
129 * 192-255: sibling 3
130 */
131 __multiorder_tag_test(0, 8);
132 __multiorder_tag_test(190, 8);
133
134 /* same test, but with indices 256-511 */
135 __multiorder_tag_test(256, 8);
136 __multiorder_tag_test(300, 8);
137
138 __multiorder_tag_test(0x12345678UL, 8);
139
140 for (i = 1; i < 10; i++)
141 for (j = 0; j < (10 << i); j++)
142 __multiorder_tag_test2(i, j);
143}
144
145static void multiorder_check(unsigned long index, int order)
146{
147 unsigned long i;
148 unsigned long min = index & ~((1UL << order) - 1);
149 unsigned long max = min + (1UL << order);
150 void **slot;
151 struct item *item2 = item_create(min, order);
152 RADIX_TREE(tree, GFP_KERNEL);
153
154 printv(2, "Multiorder index %ld, order %d\n", index, order);
155
156 assert(item_insert_order(&tree, index, order) == 0);
157
158 for (i = min; i < max; i++) {
159 struct item *item = item_lookup(&tree, i);
160 assert(item != 0);
161 assert(item->index == index);
162 }
163 for (i = 0; i < min; i++)
164 item_check_absent(&tree, i);
165 for (i = max; i < 2*max; i++)
166 item_check_absent(&tree, i);
167 for (i = min; i < max; i++)
168 assert(radix_tree_insert(&tree, i, item2) == -EEXIST);
169
170 slot = radix_tree_lookup_slot(&tree, index);
171 free(*slot);
172 radix_tree_replace_slot(&tree, slot, item2);
173 for (i = min; i < max; i++) {
174 struct item *item = item_lookup(&tree, i);
175 assert(item != 0);
176 assert(item->index == min);
177 }
178
179 assert(item_delete(&tree, min) != 0);
180
181 for (i = 0; i < 2*max; i++)
182 item_check_absent(&tree, i);
183}
184
185static void multiorder_shrink(unsigned long index, int order)
186{
187 unsigned long i;
188 unsigned long max = 1 << order;
189 RADIX_TREE(tree, GFP_KERNEL);
190 struct radix_tree_node *node;
191
192 printv(2, "Multiorder shrink index %ld, order %d\n", index, order);
193 28
194 assert(item_insert_order(&tree, 0, order) == 0); 29 do {
195 30 xas_lock(&xas);
196 node = tree.rnode; 31 xas_store(&xas, item);
197 32 xas_unlock(&xas);
198 assert(item_insert(&tree, index) == 0); 33 } while (xas_nomem(&xas, GFP_KERNEL));
199 assert(node != tree.rnode);
200
201 assert(item_delete(&tree, index) != 0);
202 assert(node == tree.rnode);
203
204 for (i = 0; i < max; i++) {
205 struct item *item = item_lookup(&tree, i);
206 assert(item != 0);
207 assert(item->index == 0);
208 }
209 for (i = max; i < 2*max; i++)
210 item_check_absent(&tree, i);
211
212 if (!item_delete(&tree, 0)) {
213 printv(2, "failed to delete index %ld (order %d)\n", index, order);
214 abort();
215 }
216
217 for (i = 0; i < 2*max; i++)
218 item_check_absent(&tree, i);
219}
220
221static void multiorder_insert_bug(void)
222{
223 RADIX_TREE(tree, GFP_KERNEL);
224 34
225 item_insert(&tree, 0); 35 if (!xas_error(&xas))
226 radix_tree_tag_set(&tree, 0, 0); 36 return 0;
227 item_insert_order(&tree, 3 << 6, 6);
228 37
229 item_kill_tree(&tree); 38 free(item);
39 return xas_error(&xas);
230} 40}
231 41
232void multiorder_iteration(void) 42void multiorder_iteration(struct xarray *xa)
233{ 43{
234 RADIX_TREE(tree, GFP_KERNEL); 44 XA_STATE(xas, xa, 0);
235 struct radix_tree_iter iter; 45 struct item *item;
236 void **slot;
237 int i, j, err; 46 int i, j, err;
238 47
239 printv(1, "Multiorder iteration test\n");
240
241#define NUM_ENTRIES 11 48#define NUM_ENTRIES 11
242 int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128}; 49 int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128};
243 int order[NUM_ENTRIES] = {1, 1, 2, 3, 4, 1, 0, 1, 3, 0, 7}; 50 int order[NUM_ENTRIES] = {1, 1, 2, 3, 4, 1, 0, 1, 3, 0, 7};
244 51
52 printv(1, "Multiorder iteration test\n");
53
245 for (i = 0; i < NUM_ENTRIES; i++) { 54 for (i = 0; i < NUM_ENTRIES; i++) {
246 err = item_insert_order(&tree, index[i], order[i]); 55 err = item_insert_order(xa, index[i], order[i]);
247 assert(!err); 56 assert(!err);
248 } 57 }
249 58
@@ -252,14 +61,14 @@ void multiorder_iteration(void)
252 if (j <= (index[i] | ((1 << order[i]) - 1))) 61 if (j <= (index[i] | ((1 << order[i]) - 1)))
253 break; 62 break;
254 63
255 radix_tree_for_each_slot(slot, &tree, &iter, j) { 64 xas_set(&xas, j);
256 int height = order[i] / RADIX_TREE_MAP_SHIFT; 65 xas_for_each(&xas, item, ULONG_MAX) {
257 int shift = height * RADIX_TREE_MAP_SHIFT; 66 int height = order[i] / XA_CHUNK_SHIFT;
67 int shift = height * XA_CHUNK_SHIFT;
258 unsigned long mask = (1UL << order[i]) - 1; 68 unsigned long mask = (1UL << order[i]) - 1;
259 struct item *item = *slot;
260 69
261 assert((iter.index | mask) == (index[i] | mask)); 70 assert((xas.xa_index | mask) == (index[i] | mask));
262 assert(iter.shift == shift); 71 assert(xas.xa_node->shift == shift);
263 assert(!radix_tree_is_internal_node(item)); 72 assert(!radix_tree_is_internal_node(item));
264 assert((item->index | mask) == (index[i] | mask)); 73 assert((item->index | mask) == (index[i] | mask));
265 assert(item->order == order[i]); 74 assert(item->order == order[i]);
@@ -267,18 +76,15 @@ void multiorder_iteration(void)
267 } 76 }
268 } 77 }
269 78
270 item_kill_tree(&tree); 79 item_kill_tree(xa);
271} 80}
272 81
273void multiorder_tagged_iteration(void) 82void multiorder_tagged_iteration(struct xarray *xa)
274{ 83{
275 RADIX_TREE(tree, GFP_KERNEL); 84 XA_STATE(xas, xa, 0);
276 struct radix_tree_iter iter; 85 struct item *item;
277 void **slot;
278 int i, j; 86 int i, j;
279 87
280 printv(1, "Multiorder tagged iteration test\n");
281
282#define MT_NUM_ENTRIES 9 88#define MT_NUM_ENTRIES 9
283 int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128}; 89 int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128};
284 int order[MT_NUM_ENTRIES] = {1, 0, 2, 4, 3, 1, 3, 0, 7}; 90 int order[MT_NUM_ENTRIES] = {1, 0, 2, 4, 3, 1, 3, 0, 7};
@@ -286,13 +92,15 @@ void multiorder_tagged_iteration(void)
286#define TAG_ENTRIES 7 92#define TAG_ENTRIES 7
287 int tag_index[TAG_ENTRIES] = {0, 4, 16, 40, 64, 72, 128}; 93 int tag_index[TAG_ENTRIES] = {0, 4, 16, 40, 64, 72, 128};
288 94
95 printv(1, "Multiorder tagged iteration test\n");
96
289 for (i = 0; i < MT_NUM_ENTRIES; i++) 97 for (i = 0; i < MT_NUM_ENTRIES; i++)
290 assert(!item_insert_order(&tree, index[i], order[i])); 98 assert(!item_insert_order(xa, index[i], order[i]));
291 99
292 assert(!radix_tree_tagged(&tree, 1)); 100 assert(!xa_marked(xa, XA_MARK_1));
293 101
294 for (i = 0; i < TAG_ENTRIES; i++) 102 for (i = 0; i < TAG_ENTRIES; i++)
295 assert(radix_tree_tag_set(&tree, tag_index[i], 1)); 103 xa_set_mark(xa, tag_index[i], XA_MARK_1);
296 104
297 for (j = 0; j < 256; j++) { 105 for (j = 0; j < 256; j++) {
298 int k; 106 int k;
@@ -304,23 +112,23 @@ void multiorder_tagged_iteration(void)
304 break; 112 break;
305 } 113 }
306 114
307 radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { 115 xas_set(&xas, j);
116 xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_1) {
308 unsigned long mask; 117 unsigned long mask;
309 struct item *item = *slot;
310 for (k = i; index[k] < tag_index[i]; k++) 118 for (k = i; index[k] < tag_index[i]; k++)
311 ; 119 ;
312 mask = (1UL << order[k]) - 1; 120 mask = (1UL << order[k]) - 1;
313 121
314 assert((iter.index | mask) == (tag_index[i] | mask)); 122 assert((xas.xa_index | mask) == (tag_index[i] | mask));
315 assert(!radix_tree_is_internal_node(item)); 123 assert(!xa_is_internal(item));
316 assert((item->index | mask) == (tag_index[i] | mask)); 124 assert((item->index | mask) == (tag_index[i] | mask));
317 assert(item->order == order[k]); 125 assert(item->order == order[k]);
318 i++; 126 i++;
319 } 127 }
320 } 128 }
321 129
322 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) == 130 assert(tag_tagged_items(xa, 0, ULONG_MAX, TAG_ENTRIES, XA_MARK_1,
323 TAG_ENTRIES); 131 XA_MARK_2) == TAG_ENTRIES);
324 132
325 for (j = 0; j < 256; j++) { 133 for (j = 0; j < 256; j++) {
326 int mask, k; 134 int mask, k;
@@ -332,297 +140,31 @@ void multiorder_tagged_iteration(void)
332 break; 140 break;
333 } 141 }
334 142
335 radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { 143 xas_set(&xas, j);
336 struct item *item = *slot; 144 xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_2) {
337 for (k = i; index[k] < tag_index[i]; k++) 145 for (k = i; index[k] < tag_index[i]; k++)
338 ; 146 ;
339 mask = (1 << order[k]) - 1; 147 mask = (1 << order[k]) - 1;
340 148
341 assert((iter.index | mask) == (tag_index[i] | mask)); 149 assert((xas.xa_index | mask) == (tag_index[i] | mask));
342 assert(!radix_tree_is_internal_node(item)); 150 assert(!xa_is_internal(item));
343 assert((item->index | mask) == (tag_index[i] | mask)); 151 assert((item->index | mask) == (tag_index[i] | mask));
344 assert(item->order == order[k]); 152 assert(item->order == order[k]);
345 i++; 153 i++;
346 } 154 }
347 } 155 }
348 156
349 assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0) 157 assert(tag_tagged_items(xa, 1, ULONG_MAX, MT_NUM_ENTRIES * 2, XA_MARK_1,
350 == TAG_ENTRIES); 158 XA_MARK_0) == TAG_ENTRIES);
351 i = 0; 159 i = 0;
352 radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { 160 xas_set(&xas, 0);
353 assert(iter.index == tag_index[i]); 161 xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_0) {
162 assert(xas.xa_index == tag_index[i]);
354 i++; 163 i++;
355 } 164 }
165 assert(i == TAG_ENTRIES);
356 166
357 item_kill_tree(&tree); 167 item_kill_tree(xa);
358}
359
360/*
361 * Basic join checks: make sure we can't find an entry in the tree after
362 * a larger entry has replaced it
363 */
364static void multiorder_join1(unsigned long index,
365 unsigned order1, unsigned order2)
366{
367 unsigned long loc;
368 void *item, *item2 = item_create(index + 1, order1);
369 RADIX_TREE(tree, GFP_KERNEL);
370
371 item_insert_order(&tree, index, order2);
372 item = radix_tree_lookup(&tree, index);
373 radix_tree_join(&tree, index + 1, order1, item2);
374 loc = find_item(&tree, item);
375 if (loc == -1)
376 free(item);
377 item = radix_tree_lookup(&tree, index + 1);
378 assert(item == item2);
379 item_kill_tree(&tree);
380}
381
382/*
383 * Check that the accounting of exceptional entries is handled correctly
384 * by joining an exceptional entry to a normal pointer.
385 */
386static void multiorder_join2(unsigned order1, unsigned order2)
387{
388 RADIX_TREE(tree, GFP_KERNEL);
389 struct radix_tree_node *node;
390 void *item1 = item_create(0, order1);
391 void *item2;
392
393 item_insert_order(&tree, 0, order2);
394 radix_tree_insert(&tree, 1 << order2, (void *)0x12UL);
395 item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
396 assert(item2 == (void *)0x12UL);
397 assert(node->exceptional == 1);
398
399 item2 = radix_tree_lookup(&tree, 0);
400 free(item2);
401
402 radix_tree_join(&tree, 0, order1, item1);
403 item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
404 assert(item2 == item1);
405 assert(node->exceptional == 0);
406 item_kill_tree(&tree);
407}
408
409/*
410 * This test revealed an accounting bug for exceptional entries at one point.
411 * Nodes were being freed back into the pool with an elevated exception count
412 * by radix_tree_join() and then radix_tree_split() was failing to zero the
413 * count of exceptional entries.
414 */
415static void multiorder_join3(unsigned int order)
416{
417 RADIX_TREE(tree, GFP_KERNEL);
418 struct radix_tree_node *node;
419 void **slot;
420 struct radix_tree_iter iter;
421 unsigned long i;
422
423 for (i = 0; i < (1 << order); i++) {
424 radix_tree_insert(&tree, i, (void *)0x12UL);
425 }
426
427 radix_tree_join(&tree, 0, order, (void *)0x16UL);
428 rcu_barrier();
429
430 radix_tree_split(&tree, 0, 0);
431
432 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
433 radix_tree_iter_replace(&tree, &iter, slot, (void *)0x12UL);
434 }
435
436 __radix_tree_lookup(&tree, 0, &node, NULL);
437 assert(node->exceptional == node->count);
438
439 item_kill_tree(&tree);
440}
441
442static void multiorder_join(void)
443{
444 int i, j, idx;
445
446 for (idx = 0; idx < 1024; idx = idx * 2 + 3) {
447 for (i = 1; i < 15; i++) {
448 for (j = 0; j < i; j++) {
449 multiorder_join1(idx, i, j);
450 }
451 }
452 }
453
454 for (i = 1; i < 15; i++) {
455 for (j = 0; j < i; j++) {
456 multiorder_join2(i, j);
457 }
458 }
459
460 for (i = 3; i < 10; i++) {
461 multiorder_join3(i);
462 }
463}
464
465static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc)
466{
467 struct radix_tree_preload *rtp = &radix_tree_preloads;
468 if (rtp->nr != 0)
469 printv(2, "split(%u %u) remaining %u\n", old_order, new_order,
470 rtp->nr);
471 /*
472 * Can't check for equality here as some nodes may have been
473 * RCU-freed while we ran. But we should never finish with more
474 * nodes allocated since they should have all been preloaded.
475 */
476 if (nr_allocated > alloc)
477 printv(2, "split(%u %u) allocated %u %u\n", old_order, new_order,
478 alloc, nr_allocated);
479}
480
481static void __multiorder_split(int old_order, int new_order)
482{
483 RADIX_TREE(tree, GFP_ATOMIC);
484 void **slot;
485 struct radix_tree_iter iter;
486 unsigned alloc;
487 struct item *item;
488
489 radix_tree_preload(GFP_KERNEL);
490 assert(item_insert_order(&tree, 0, old_order) == 0);
491 radix_tree_preload_end();
492
493 /* Wipe out the preloaded cache or it'll confuse check_mem() */
494 radix_tree_cpu_dead(0);
495
496 item = radix_tree_tag_set(&tree, 0, 2);
497
498 radix_tree_split_preload(old_order, new_order, GFP_KERNEL);
499 alloc = nr_allocated;
500 radix_tree_split(&tree, 0, new_order);
501 check_mem(old_order, new_order, alloc);
502 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
503 radix_tree_iter_replace(&tree, &iter, slot,
504 item_create(iter.index, new_order));
505 }
506 radix_tree_preload_end();
507
508 item_kill_tree(&tree);
509 free(item);
510}
511
512static void __multiorder_split2(int old_order, int new_order)
513{
514 RADIX_TREE(tree, GFP_KERNEL);
515 void **slot;
516 struct radix_tree_iter iter;
517 struct radix_tree_node *node;
518 void *item;
519
520 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
521
522 item = __radix_tree_lookup(&tree, 0, &node, NULL);
523 assert(item == (void *)0x12);
524 assert(node->exceptional > 0);
525
526 radix_tree_split(&tree, 0, new_order);
527 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
528 radix_tree_iter_replace(&tree, &iter, slot,
529 item_create(iter.index, new_order));
530 }
531
532 item = __radix_tree_lookup(&tree, 0, &node, NULL);
533 assert(item != (void *)0x12);
534 assert(node->exceptional == 0);
535
536 item_kill_tree(&tree);
537}
538
539static void __multiorder_split3(int old_order, int new_order)
540{
541 RADIX_TREE(tree, GFP_KERNEL);
542 void **slot;
543 struct radix_tree_iter iter;
544 struct radix_tree_node *node;
545 void *item;
546
547 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
548
549 item = __radix_tree_lookup(&tree, 0, &node, NULL);
550 assert(item == (void *)0x12);
551 assert(node->exceptional > 0);
552
553 radix_tree_split(&tree, 0, new_order);
554 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
555 radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16);
556 }
557
558 item = __radix_tree_lookup(&tree, 0, &node, NULL);
559 assert(item == (void *)0x16);
560 assert(node->exceptional > 0);
561
562 item_kill_tree(&tree);
563
564 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
565
566 item = __radix_tree_lookup(&tree, 0, &node, NULL);
567 assert(item == (void *)0x12);
568 assert(node->exceptional > 0);
569
570 radix_tree_split(&tree, 0, new_order);
571 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
572 if (iter.index == (1 << new_order))
573 radix_tree_iter_replace(&tree, &iter, slot,
574 (void *)0x16);
575 else
576 radix_tree_iter_replace(&tree, &iter, slot, NULL);
577 }
578
579 item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL);
580 assert(item == (void *)0x16);
581 assert(node->count == node->exceptional);
582 do {
583 node = node->parent;
584 if (!node)
585 break;
586 assert(node->count == 1);
587 assert(node->exceptional == 0);
588 } while (1);
589
590 item_kill_tree(&tree);
591}
592
593static void multiorder_split(void)
594{
595 int i, j;
596
597 for (i = 3; i < 11; i++)
598 for (j = 0; j < i; j++) {
599 __multiorder_split(i, j);
600 __multiorder_split2(i, j);
601 __multiorder_split3(i, j);
602 }
603}
604
605static void multiorder_account(void)
606{
607 RADIX_TREE(tree, GFP_KERNEL);
608 struct radix_tree_node *node;
609 void **slot;
610
611 item_insert_order(&tree, 0, 5);
612
613 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
614 __radix_tree_lookup(&tree, 0, &node, NULL);
615 assert(node->count == node->exceptional * 2);
616 radix_tree_delete(&tree, 1 << 5);
617 assert(node->exceptional == 0);
618
619 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
620 __radix_tree_lookup(&tree, 1 << 5, &node, &slot);
621 assert(node->count == node->exceptional * 2);
622 __radix_tree_replace(&tree, node, slot, NULL, NULL);
623 assert(node->exceptional == 0);
624
625 item_kill_tree(&tree);
626} 168}
627 169
628bool stop_iteration = false; 170bool stop_iteration = false;
@@ -645,68 +187,45 @@ static void *creator_func(void *ptr)
645 187
646static void *iterator_func(void *ptr) 188static void *iterator_func(void *ptr)
647{ 189{
648 struct radix_tree_root *tree = ptr; 190 XA_STATE(xas, ptr, 0);
649 struct radix_tree_iter iter;
650 struct item *item; 191 struct item *item;
651 void **slot;
652 192
653 while (!stop_iteration) { 193 while (!stop_iteration) {
654 rcu_read_lock(); 194 rcu_read_lock();
655 radix_tree_for_each_slot(slot, tree, &iter, 0) { 195 xas_for_each(&xas, item, ULONG_MAX) {
656 item = radix_tree_deref_slot(slot); 196 if (xas_retry(&xas, item))
657
658 if (!item)
659 continue; 197 continue;
660 if (radix_tree_deref_retry(item)) {
661 slot = radix_tree_iter_retry(&iter);
662 continue;
663 }
664 198
665 item_sanity(item, iter.index); 199 item_sanity(item, xas.xa_index);
666 } 200 }
667 rcu_read_unlock(); 201 rcu_read_unlock();
668 } 202 }
669 return NULL; 203 return NULL;
670} 204}
671 205
672static void multiorder_iteration_race(void) 206static void multiorder_iteration_race(struct xarray *xa)
673{ 207{
674 const int num_threads = sysconf(_SC_NPROCESSORS_ONLN); 208 const int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
675 pthread_t worker_thread[num_threads]; 209 pthread_t worker_thread[num_threads];
676 RADIX_TREE(tree, GFP_KERNEL);
677 int i; 210 int i;
678 211
679 pthread_create(&worker_thread[0], NULL, &creator_func, &tree); 212 pthread_create(&worker_thread[0], NULL, &creator_func, xa);
680 for (i = 1; i < num_threads; i++) 213 for (i = 1; i < num_threads; i++)
681 pthread_create(&worker_thread[i], NULL, &iterator_func, &tree); 214 pthread_create(&worker_thread[i], NULL, &iterator_func, xa);
682 215
683 for (i = 0; i < num_threads; i++) 216 for (i = 0; i < num_threads; i++)
684 pthread_join(worker_thread[i], NULL); 217 pthread_join(worker_thread[i], NULL);
685 218
686 item_kill_tree(&tree); 219 item_kill_tree(xa);
687} 220}
688 221
222static DEFINE_XARRAY(array);
223
689void multiorder_checks(void) 224void multiorder_checks(void)
690{ 225{
691 int i; 226 multiorder_iteration(&array);
692 227 multiorder_tagged_iteration(&array);
693 for (i = 0; i < 20; i++) { 228 multiorder_iteration_race(&array);
694 multiorder_check(200, i);
695 multiorder_check(0, i);
696 multiorder_check((1UL << i) + 1, i);
697 }
698
699 for (i = 0; i < 15; i++)
700 multiorder_shrink((1UL << (i + RADIX_TREE_MAP_SHIFT)), i);
701
702 multiorder_insert_bug();
703 multiorder_tag_tests();
704 multiorder_iteration();
705 multiorder_tagged_iteration();
706 multiorder_join();
707 multiorder_split();
708 multiorder_account();
709 multiorder_iteration_race();
710 229
711 radix_tree_cpu_dead(0); 230 radix_tree_cpu_dead(0);
712} 231}
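
The iterator_func() hunk above is the template for most conversions in this series: the radix_tree_deref_slot()/radix_tree_deref_retry() dance collapses into xas_for_each() plus xas_retry(). As a minimal sketch of the new idiom (count_entries() is an illustrative helper, not code from the commit):

#include <linux/rcupdate.h>
#include <linux/xarray.h>

/* Walk every present entry under RCU, XArray style. */
static unsigned long count_entries(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);		/* iteration cursor, starting at index 0 */
	void *entry;
	unsigned long nr = 0;

	rcu_read_lock();
	xas_for_each(&xas, entry, ULONG_MAX) {
		if (xas_retry(&xas, entry))	/* transparently restart on a retry entry */
			continue;
		nr++;
	}
	rcu_read_unlock();
	return nr;
}
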
diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c
index 0aece092f40e..a61c7bcbc72d 100644
--- a/tools/testing/radix-tree/regression1.c
+++ b/tools/testing/radix-tree/regression1.c
@@ -44,7 +44,6 @@
44#include "regression.h" 44#include "regression.h"
45 45
46static RADIX_TREE(mt_tree, GFP_KERNEL); 46static RADIX_TREE(mt_tree, GFP_KERNEL);
47static pthread_mutex_t mt_lock = PTHREAD_MUTEX_INITIALIZER;
48 47
49struct page { 48struct page {
50 pthread_mutex_t lock; 49 pthread_mutex_t lock;
@@ -53,12 +52,12 @@ struct page {
53 unsigned long index; 52 unsigned long index;
54}; 53};
55 54
56static struct page *page_alloc(void) 55static struct page *page_alloc(int index)
57{ 56{
58 struct page *p; 57 struct page *p;
59 p = malloc(sizeof(struct page)); 58 p = malloc(sizeof(struct page));
60 p->count = 1; 59 p->count = 1;
61 p->index = 1; 60 p->index = index;
62 pthread_mutex_init(&p->lock, NULL); 61 pthread_mutex_init(&p->lock, NULL);
63 62
64 return p; 63 return p;
@@ -80,53 +79,33 @@ static void page_free(struct page *p)
80static unsigned find_get_pages(unsigned long start, 79static unsigned find_get_pages(unsigned long start,
81 unsigned int nr_pages, struct page **pages) 80 unsigned int nr_pages, struct page **pages)
82{ 81{
83 unsigned int i; 82 XA_STATE(xas, &mt_tree, start);
84 unsigned int ret; 83 struct page *page;
85 unsigned int nr_found; 84 unsigned int ret = 0;
86 85
87 rcu_read_lock(); 86 rcu_read_lock();
88restart: 87 xas_for_each(&xas, page, ULONG_MAX) {
89 nr_found = radix_tree_gang_lookup_slot(&mt_tree, 88 if (xas_retry(&xas, page))
90 (void ***)pages, NULL, start, nr_pages);
91 ret = 0;
92 for (i = 0; i < nr_found; i++) {
93 struct page *page;
94repeat:
95 page = radix_tree_deref_slot((void **)pages[i]);
96 if (unlikely(!page))
97 continue; 89 continue;
98 90
99 if (radix_tree_exception(page)) {
100 if (radix_tree_deref_retry(page)) {
101 /*
102 * Transient condition which can only trigger
103 * when entry at index 0 moves out of or back
104 * to root: none yet gotten, safe to restart.
105 */
106 assert((start | i) == 0);
107 goto restart;
108 }
109 /*
110 * No exceptional entries are inserted in this test.
111 */
112 assert(0);
113 }
114
115 pthread_mutex_lock(&page->lock); 91 pthread_mutex_lock(&page->lock);
116 if (!page->count) { 92 if (!page->count)
117 pthread_mutex_unlock(&page->lock); 93 goto unlock;
118 goto repeat; 94
119 }
120 /* don't actually update page refcount */ 95 /* don't actually update page refcount */
121 pthread_mutex_unlock(&page->lock); 96 pthread_mutex_unlock(&page->lock);
122 97
123 /* Has the page moved? */ 98 /* Has the page moved? */
124 if (unlikely(page != *((void **)pages[i]))) { 99 if (unlikely(page != xas_reload(&xas)))
125 goto repeat; 100 goto put_page;
126 }
127 101
128 pages[ret] = page; 102 pages[ret] = page;
129 ret++; 103 ret++;
104 continue;
105unlock:
106 pthread_mutex_unlock(&page->lock);
107put_page:
108 xas_reset(&xas);
130 } 109 }
131 rcu_read_unlock(); 110 rcu_read_unlock();
132 return ret; 111 return ret;
@@ -145,30 +124,30 @@ static void *regression1_fn(void *arg)
145 for (j = 0; j < 1000000; j++) { 124 for (j = 0; j < 1000000; j++) {
146 struct page *p; 125 struct page *p;
147 126
148 p = page_alloc(); 127 p = page_alloc(0);
149 pthread_mutex_lock(&mt_lock); 128 xa_lock(&mt_tree);
150 radix_tree_insert(&mt_tree, 0, p); 129 radix_tree_insert(&mt_tree, 0, p);
151 pthread_mutex_unlock(&mt_lock); 130 xa_unlock(&mt_tree);
152 131
153 p = page_alloc(); 132 p = page_alloc(1);
154 pthread_mutex_lock(&mt_lock); 133 xa_lock(&mt_tree);
155 radix_tree_insert(&mt_tree, 1, p); 134 radix_tree_insert(&mt_tree, 1, p);
156 pthread_mutex_unlock(&mt_lock); 135 xa_unlock(&mt_tree);
157 136
158 pthread_mutex_lock(&mt_lock); 137 xa_lock(&mt_tree);
159 p = radix_tree_delete(&mt_tree, 1); 138 p = radix_tree_delete(&mt_tree, 1);
160 pthread_mutex_lock(&p->lock); 139 pthread_mutex_lock(&p->lock);
161 p->count--; 140 p->count--;
162 pthread_mutex_unlock(&p->lock); 141 pthread_mutex_unlock(&p->lock);
163 pthread_mutex_unlock(&mt_lock); 142 xa_unlock(&mt_tree);
164 page_free(p); 143 page_free(p);
165 144
166 pthread_mutex_lock(&mt_lock); 145 xa_lock(&mt_tree);
167 p = radix_tree_delete(&mt_tree, 0); 146 p = radix_tree_delete(&mt_tree, 0);
168 pthread_mutex_lock(&p->lock); 147 pthread_mutex_lock(&p->lock);
169 p->count--; 148 p->count--;
170 pthread_mutex_unlock(&p->lock); 149 pthread_mutex_unlock(&p->lock);
171 pthread_mutex_unlock(&mt_lock); 150 xa_unlock(&mt_tree);
172 page_free(p); 151 page_free(p);
173 } 152 }
174 } else { 153 } else {
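
Beyond the mechanical call replacements, the regression1 conversion shows two new idioms: the external mt_lock mutex gives way to the XArray's own lock (xa_lock()/xa_unlock()), and the "has the page moved?" recheck is expressed with xas_reload(), restarting via xas_reset() when the entry changed underneath. A rough sketch of that reader-side pattern (lookup_stable() is invented for illustration; the commented try_get()/put() steps stand in for the test's refcount games):

#include <linux/rcupdate.h>
#include <linux/xarray.h>

/* Speculatively look up an entry and confirm it did not move under us. */
static void *lookup_stable(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	void *entry;

	rcu_read_lock();
	do {
		entry = xas_load(&xas);
		if (xas_retry(&xas, entry))
			continue;		/* node being torn down: re-walk */
		if (!entry)
			break;			/* genuine hole */
		/* ... try_get(entry) would go here ... */
		if (entry == xas_reload(&xas))
			break;			/* still in place, keep it */
		/* ... put(entry): it was replaced, start over ... */
		xas_reset(&xas);
	} while (1);
	rcu_read_unlock();
	return entry;
}
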
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
index 424b91c77831..f2c7e640a919 100644
--- a/tools/testing/radix-tree/regression2.c
+++ b/tools/testing/radix-tree/regression2.c
@@ -53,9 +53,9 @@
53#include "regression.h" 53#include "regression.h"
54#include "test.h" 54#include "test.h"
55 55
56#define PAGECACHE_TAG_DIRTY 0 56#define PAGECACHE_TAG_DIRTY XA_MARK_0
57#define PAGECACHE_TAG_WRITEBACK 1 57#define PAGECACHE_TAG_WRITEBACK XA_MARK_1
58#define PAGECACHE_TAG_TOWRITE 2 58#define PAGECACHE_TAG_TOWRITE XA_MARK_2
59 59
60static RADIX_TREE(mt_tree, GFP_KERNEL); 60static RADIX_TREE(mt_tree, GFP_KERNEL);
61unsigned long page_count = 0; 61unsigned long page_count = 0;
@@ -92,7 +92,7 @@ void regression2_test(void)
92 /* 1. */ 92 /* 1. */
93 start = 0; 93 start = 0;
94 end = max_slots - 2; 94 end = max_slots - 2;
95 tag_tagged_items(&mt_tree, NULL, start, end, 1, 95 tag_tagged_items(&mt_tree, start, end, 1,
96 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); 96 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
97 97
98 /* 2. */ 98 /* 2. */
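
The regression2 hunk is a pure type change: the numeric page-cache tags become xa_mark_t values. Marks are manipulated with the xa_set_mark()/xa_get_mark()/xa_clear_mark() family; a minimal sketch (mark_demo() and the chosen index are illustrative only, not from the commit):

#include <linux/xarray.h>

static DEFINE_XARRAY(mark_demo_xa);

static bool mark_demo(void)
{
	/* Store a value entry, then flag it "dirty" via XA_MARK_0. */
	xa_store(&mark_demo_xa, 5, xa_mk_value(5), GFP_KERNEL);
	xa_set_mark(&mark_demo_xa, 5, XA_MARK_0);

	return xa_get_mark(&mark_demo_xa, 5, XA_MARK_0);	/* true */
}
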
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
index ace2543c3eda..9f9a3b280f56 100644
--- a/tools/testing/radix-tree/regression3.c
+++ b/tools/testing/radix-tree/regression3.c
@@ -69,21 +69,6 @@ void regression3_test(void)
69 continue; 69 continue;
70 } 70 }
71 } 71 }
72 radix_tree_delete(&root, 1);
73
74 first = true;
75 radix_tree_for_each_contig(slot, &root, &iter, 0) {
76 printv(2, "contig %ld %p\n", iter.index, *slot);
77 if (first) {
78 radix_tree_insert(&root, 1, ptr);
79 first = false;
80 }
81 if (radix_tree_deref_retry(*slot)) {
82 printv(2, "retry at %ld\n", iter.index);
83 slot = radix_tree_iter_retry(&iter);
84 continue;
85 }
86 }
87 72
88 radix_tree_for_each_slot(slot, &root, &iter, 0) { 73 radix_tree_for_each_slot(slot, &root, &iter, 0) {
89 printv(2, "slot %ld %p\n", iter.index, *slot); 74 printv(2, "slot %ld %p\n", iter.index, *slot);
@@ -93,14 +78,6 @@ void regression3_test(void)
93 } 78 }
94 } 79 }
95 80
96 radix_tree_for_each_contig(slot, &root, &iter, 0) {
97 printv(2, "contig %ld %p\n", iter.index, *slot);
98 if (!iter.index) {
99 printv(2, "next at %ld\n", iter.index);
100 slot = radix_tree_iter_resume(slot, &iter);
101 }
102 }
103
104 radix_tree_tag_set(&root, 0, 0); 81 radix_tree_tag_set(&root, 0, 0);
105 radix_tree_tag_set(&root, 1, 0); 82 radix_tree_tag_set(&root, 1, 0);
106 radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) { 83 radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) {
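
The hunks deleted from regression3 were the last exercises of radix_tree_for_each_contig(), which the series removes outright. Should a contiguous walk ever be needed against the XArray, the closest equivalent is stepping with xas_next(); a hedged, single-walker sketch (count_contig() is invented for illustration and skips retry handling on the assumption that nothing modifies the array concurrently):

#include <linux/rcupdate.h>
#include <linux/xarray.h>

/* Count consecutive present indices from 'start' until the first hole. */
static unsigned long count_contig(struct xarray *xa, unsigned long start)
{
	XA_STATE(xas, xa, start);
	void *entry;
	unsigned long n = 0;

	rcu_read_lock();
	for (entry = xas_load(&xas); entry; entry = xas_next(&xas))
		n++;
	rcu_read_unlock();
	return n;
}
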
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
index 543181e4847b..f898957b1a19 100644
--- a/tools/testing/radix-tree/tag_check.c
+++ b/tools/testing/radix-tree/tag_check.c
@@ -24,7 +24,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
24 item_tag_set(tree, index, tag); 24 item_tag_set(tree, index, tag);
25 ret = item_tag_get(tree, index, tag); 25 ret = item_tag_get(tree, index, tag);
26 assert(ret != 0); 26 assert(ret != 0);
27 ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag); 27 ret = tag_tagged_items(tree, first, ~0UL, 10, tag, !tag);
28 assert(ret == 1); 28 assert(ret == 1);
29 ret = item_tag_get(tree, index, !tag); 29 ret = item_tag_get(tree, index, !tag);
30 assert(ret != 0); 30 assert(ret != 0);
@@ -321,7 +321,7 @@ static void single_check(void)
321 assert(ret == 0); 321 assert(ret == 0);
322 verify_tag_consistency(&tree, 0); 322 verify_tag_consistency(&tree, 0);
323 verify_tag_consistency(&tree, 1); 323 verify_tag_consistency(&tree, 1);
324 ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1); 324 ret = tag_tagged_items(&tree, first, 10, 10, XA_MARK_0, XA_MARK_1);
325 assert(ret == 1); 325 assert(ret == 1);
326 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); 326 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1);
327 assert(ret == 1); 327 assert(ret == 1);
@@ -331,34 +331,6 @@ static void single_check(void)
331 item_kill_tree(&tree); 331 item_kill_tree(&tree);
332} 332}
333 333
334void radix_tree_clear_tags_test(void)
335{
336 unsigned long index;
337 struct radix_tree_node *node;
338 struct radix_tree_iter iter;
339 void **slot;
340
341 RADIX_TREE(tree, GFP_KERNEL);
342
343 item_insert(&tree, 0);
344 item_tag_set(&tree, 0, 0);
345 __radix_tree_lookup(&tree, 0, &node, &slot);
346 radix_tree_clear_tags(&tree, node, slot);
347 assert(item_tag_get(&tree, 0, 0) == 0);
348
349 for (index = 0; index < 1000; index++) {
350 item_insert(&tree, index);
351 item_tag_set(&tree, index, 0);
352 }
353
354 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
355 radix_tree_clear_tags(&tree, iter.node, slot);
356 assert(item_tag_get(&tree, iter.index, 0) == 0);
357 }
358
359 item_kill_tree(&tree);
360}
361
362void tag_check(void) 334void tag_check(void)
363{ 335{
364 single_check(); 336 single_check();
@@ -376,5 +348,4 @@ void tag_check(void)
376 thrash_tags(); 348 thrash_tags();
377 rcu_barrier(); 349 rcu_barrier();
378 printv(2, "after thrash_tags: %d allocated\n", nr_allocated); 350 printv(2, "after thrash_tags: %d allocated\n", nr_allocated);
379 radix_tree_clear_tags_test();
380} 351}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index def6015570b2..a15d0512e633 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -25,11 +25,6 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag)
25 return radix_tree_tag_get(root, index, tag); 25 return radix_tree_tag_get(root, index, tag);
26} 26}
27 27
28int __item_insert(struct radix_tree_root *root, struct item *item)
29{
30 return __radix_tree_insert(root, item->index, item->order, item);
31}
32
33struct item *item_create(unsigned long index, unsigned int order) 28struct item *item_create(unsigned long index, unsigned int order)
34{ 29{
35 struct item *ret = malloc(sizeof(*ret)); 30 struct item *ret = malloc(sizeof(*ret));
@@ -39,21 +34,15 @@ struct item *item_create(unsigned long index, unsigned int order)
39 return ret; 34 return ret;
40} 35}
41 36
42int item_insert_order(struct radix_tree_root *root, unsigned long index, 37int item_insert(struct radix_tree_root *root, unsigned long index)
43 unsigned order)
44{ 38{
45 struct item *item = item_create(index, order); 39 struct item *item = item_create(index, 0);
46 int err = __item_insert(root, item); 40 int err = radix_tree_insert(root, item->index, item);
47 if (err) 41 if (err)
48 free(item); 42 free(item);
49 return err; 43 return err;
50} 44}
51 45
52int item_insert(struct radix_tree_root *root, unsigned long index)
53{
54 return item_insert_order(root, index, 0);
55}
56
57void item_sanity(struct item *item, unsigned long index) 46void item_sanity(struct item *item, unsigned long index)
58{ 47{
59 unsigned long mask; 48 unsigned long mask;
@@ -63,16 +52,21 @@ void item_sanity(struct item *item, unsigned long index)
63 assert((item->index | mask) == (index | mask)); 52 assert((item->index | mask) == (index | mask));
64} 53}
65 54
55void item_free(struct item *item, unsigned long index)
56{
57 item_sanity(item, index);
58 free(item);
59}
60
66int item_delete(struct radix_tree_root *root, unsigned long index) 61int item_delete(struct radix_tree_root *root, unsigned long index)
67{ 62{
68 struct item *item = radix_tree_delete(root, index); 63 struct item *item = radix_tree_delete(root, index);
69 64
70 if (item) { 65 if (!item)
71 item_sanity(item, index); 66 return 0;
72 free(item); 67
73 return 1; 68 item_free(item, index);
74 } 69 return 1;
75 return 0;
76} 70}
77 71
78static void item_free_rcu(struct rcu_head *head) 72static void item_free_rcu(struct rcu_head *head)
@@ -82,9 +76,9 @@ static void item_free_rcu(struct rcu_head *head)
82 free(item); 76 free(item);
83} 77}
84 78
85int item_delete_rcu(struct radix_tree_root *root, unsigned long index) 79int item_delete_rcu(struct xarray *xa, unsigned long index)
86{ 80{
87 struct item *item = radix_tree_delete(root, index); 81 struct item *item = xa_erase(xa, index);
88 82
89 if (item) { 83 if (item) {
90 item_sanity(item, index); 84 item_sanity(item, index);
@@ -176,59 +170,30 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
176} 170}
177 171
178/* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */ 172/* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */
179int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock, 173int tag_tagged_items(struct xarray *xa, unsigned long start, unsigned long end,
180 unsigned long start, unsigned long end, unsigned batch, 174 unsigned batch, xa_mark_t iftag, xa_mark_t thentag)
181 unsigned iftag, unsigned thentag)
182{ 175{
183 unsigned long tagged = 0; 176 XA_STATE(xas, xa, start);
184 struct radix_tree_iter iter; 177 unsigned int tagged = 0;
185 void **slot; 178 struct item *item;
186 179
187 if (batch == 0) 180 if (batch == 0)
188 batch = 1; 181 batch = 1;
189 182
190 if (lock) 183 xas_lock_irq(&xas);
191 pthread_mutex_lock(lock); 184 xas_for_each_marked(&xas, item, end, iftag) {
192 radix_tree_for_each_tagged(slot, root, &iter, start, iftag) { 185 xas_set_mark(&xas, thentag);
193 if (iter.index > end) 186 if (++tagged % batch)
194 break;
195 radix_tree_iter_tag_set(root, &iter, thentag);
196 tagged++;
197 if ((tagged % batch) != 0)
198 continue; 187 continue;
199 slot = radix_tree_iter_resume(slot, &iter);
200 if (lock) {
201 pthread_mutex_unlock(lock);
202 rcu_barrier();
203 pthread_mutex_lock(lock);
204 }
205 }
206 if (lock)
207 pthread_mutex_unlock(lock);
208
209 return tagged;
210}
211 188
212/* Use the same pattern as find_swap_entry() in mm/shmem.c */ 189 xas_pause(&xas);
213unsigned long find_item(struct radix_tree_root *root, void *item) 190 xas_unlock_irq(&xas);
214{ 191 rcu_barrier();
215 struct radix_tree_iter iter; 192 xas_lock_irq(&xas);
216 void **slot;
217 unsigned long found = -1;
218 unsigned long checked = 0;
219
220 radix_tree_for_each_slot(slot, root, &iter, 0) {
221 if (*slot == item) {
222 found = iter.index;
223 break;
224 }
225 checked++;
226 if ((checked % 4) != 0)
227 continue;
228 slot = radix_tree_iter_resume(slot, &iter);
229 } 193 }
194 xas_unlock_irq(&xas);
230 195
231 return found; 196 return tagged;
232} 197}
233 198
234static int verify_node(struct radix_tree_node *slot, unsigned int tag, 199static int verify_node(struct radix_tree_node *slot, unsigned int tag,
@@ -281,43 +246,31 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag,
281 246
282void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) 247void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
283{ 248{
284 struct radix_tree_node *node = root->rnode; 249 struct radix_tree_node *node = root->xa_head;
285 if (!radix_tree_is_internal_node(node)) 250 if (!radix_tree_is_internal_node(node))
286 return; 251 return;
287 verify_node(node, tag, !!root_tag_get(root, tag)); 252 verify_node(node, tag, !!root_tag_get(root, tag));
288} 253}
289 254
290void item_kill_tree(struct radix_tree_root *root) 255void item_kill_tree(struct xarray *xa)
291{ 256{
292 struct radix_tree_iter iter; 257 XA_STATE(xas, xa, 0);
293 void **slot; 258 void *entry;
294 struct item *items[32];
295 int nfound;
296
297 radix_tree_for_each_slot(slot, root, &iter, 0) {
298 if (radix_tree_exceptional_entry(*slot))
299 radix_tree_delete(root, iter.index);
300 }
301 259
302 while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { 260 xas_for_each(&xas, entry, ULONG_MAX) {
303 int i; 261 if (!xa_is_value(entry)) {
304 262 item_free(entry, xas.xa_index);
305 for (i = 0; i < nfound; i++) {
306 void *ret;
307
308 ret = radix_tree_delete(root, items[i]->index);
309 assert(ret == items[i]);
310 free(items[i]);
311 } 263 }
264 xas_store(&xas, NULL);
312 } 265 }
313 assert(radix_tree_gang_lookup(root, (void **)items, 0, 32) == 0); 266
314 assert(root->rnode == NULL); 267 assert(xa_empty(xa));
315} 268}
316 269
317void tree_verify_min_height(struct radix_tree_root *root, int maxindex) 270void tree_verify_min_height(struct radix_tree_root *root, int maxindex)
318{ 271{
319 unsigned shift; 272 unsigned shift;
320 struct radix_tree_node *node = root->rnode; 273 struct radix_tree_node *node = root->xa_head;
321 if (!radix_tree_is_internal_node(node)) { 274 if (!radix_tree_is_internal_node(node)) {
322 assert(maxindex == 0); 275 assert(maxindex == 0);
323 return; 276 return;
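
The rewritten tag_tagged_items() above is the canonical batching pattern for marked entries: xas_for_each_marked() walks under xas_lock_irq(), and every 'batch' entries xas_pause() parks the cursor so the lock can be dropped (here to run rcu_barrier()) and safely retaken. A hypothetical caller of the new signature, assuming the test suite's test.h helpers (retag_demo() and its indices are illustrative only):

#include "test.h"

static DEFINE_XARRAY(retag_xa);

/* Copy XA_MARK_0 onto XA_MARK_1 for the first 1024 indices, 16 at a time. */
static void retag_demo(void)
{
	int moved = tag_tagged_items(&retag_xa, 0, 1023, 16,
				     XA_MARK_0, XA_MARK_1);

	printv(2, "retagged %d entries\n", moved);
}
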
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 92d901eacf49..1ee4b2c0ad10 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -11,13 +11,11 @@ struct item {
11}; 11};
12 12
13struct item *item_create(unsigned long index, unsigned int order); 13struct item *item_create(unsigned long index, unsigned int order);
14int __item_insert(struct radix_tree_root *root, struct item *item);
15int item_insert(struct radix_tree_root *root, unsigned long index); 14int item_insert(struct radix_tree_root *root, unsigned long index);
16void item_sanity(struct item *item, unsigned long index); 15void item_sanity(struct item *item, unsigned long index);
17int item_insert_order(struct radix_tree_root *root, unsigned long index, 16void item_free(struct item *item, unsigned long index);
18 unsigned order);
19int item_delete(struct radix_tree_root *root, unsigned long index); 17int item_delete(struct radix_tree_root *root, unsigned long index);
20int item_delete_rcu(struct radix_tree_root *root, unsigned long index); 18int item_delete_rcu(struct xarray *xa, unsigned long index);
21struct item *item_lookup(struct radix_tree_root *root, unsigned long index); 19struct item *item_lookup(struct radix_tree_root *root, unsigned long index);
22 20
23void item_check_present(struct radix_tree_root *root, unsigned long index); 21void item_check_present(struct radix_tree_root *root, unsigned long index);
@@ -29,11 +27,10 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
29 unsigned long nr, int chunk); 27 unsigned long nr, int chunk);
30void item_kill_tree(struct radix_tree_root *root); 28void item_kill_tree(struct radix_tree_root *root);
31 29
32int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, 30int tag_tagged_items(struct xarray *, unsigned long start, unsigned long end,
33 unsigned long start, unsigned long end, unsigned batch, 31 unsigned batch, xa_mark_t iftag, xa_mark_t thentag);
34 unsigned iftag, unsigned thentag);
35unsigned long find_item(struct radix_tree_root *, void *item);
36 32
33void xarray_tests(void);
37void tag_check(void); 34void tag_check(void);
38void multiorder_checks(void); 35void multiorder_checks(void);
39void iteration_test(unsigned order, unsigned duration); 36void iteration_test(unsigned order, unsigned duration);
diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c
new file mode 100644
index 000000000000..e61e43efe463
--- /dev/null
+++ b/tools/testing/radix-tree/xarray.c
@@ -0,0 +1,35 @@
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * xarray.c: Userspace shim for XArray test-suite
4 * Copyright (c) 2018 Matthew Wilcox <willy@infradead.org>
5 */
6
7#define XA_DEBUG
8#include "test.h"
9
10#define module_init(x)
11#define module_exit(x)
12#define MODULE_AUTHOR(x)
13#define MODULE_LICENSE(x)
14#define dump_stack() assert(0)
15
16#include "../../../lib/xarray.c"
17#undef XA_DEBUG
18#include "../../../lib/test_xarray.c"
19
20void xarray_tests(void)
21{
22 xarray_checks();
23 xarray_exit();
24}
25
26int __weak main(void)
27{
28 radix_tree_init();
29 xarray_tests();
30 radix_tree_cpu_dead(1);
31 rcu_barrier();
32 if (nr_allocated)
33 printf("nr_allocated = %d\n", nr_allocated);
34 return 0;
35}
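
With this shim, lib/xarray.c and lib/test_xarray.c compile unchanged in userspace, so every check in test_xarray.c runs both as an in-kernel module and from the radix-tree harness. Checks there follow a simple shape; the following is an invented example in that style (check_demo() is not a test from the commit) and relies on the file's local XA_BUG_ON() macro:

static noinline void check_demo(struct xarray *xa)
{
	XA_BUG_ON(xa, !xa_empty(xa));			/* start from an empty array */

	xa_store(xa, 1, xa_mk_value(1), GFP_KERNEL);	/* store a value entry */
	XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(1));

	xa_erase(xa, 1);				/* and remove it again */
	XA_BUG_ON(xa, !xa_empty(xa));
}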